/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
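/*
 * For example, predicate byte 0x05 (bits 0 and 2 set) selects byte
 * elements 0 and 2, so expand_pred_b_data[0x05] == 0x0000000000ff00ff.
 */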
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};
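
/*
 * Only indices with all odd bits clear are populated: for half-word
 * elements only every other predicate bit is significant, and the
 * lookup helper (expand_pred_h in vec_internal.h) masks its argument
 * with 0x55 before indexing this table.
 */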

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
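
/*
 * Worked example for the saturating case: with src1 = src2 = INT8_MIN
 * and round set, the doubled product 2 * 128 * 128 = 32768 does not
 * fit in the signed 8-bit high half (32768 >> 8 == 128 > INT8_MAX),
 * so the result saturates to INT8_MAX.
 */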

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

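/*
 * For the indexed ("by element") forms below, the vector is processed
 * in 128-bit segments: every element within a segment is multiplied
 * by the same scalar, taken from position idx of the corresponding
 * segment of vm.
 */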
void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

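/*
 * The test above relies on a 128-bit value fitting in int64_t exactly
 * when its high half equals the sign extension of its low half.
 * do_sqrdmlah_d below applies the same simplification as do_sqrdmlah_b,
 * carried out with Int128 arithmetic for the full 64x64-bit product.
 */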
int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
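
/*
 * For example, gvec_sdot_b computes, for each 32-bit lane i,
 *   d[i] = a[i] + n[4i]*m[4i] + n[4i+1]*m[4i+1]
 *               + n[4i+2]*m[4i+2] + n[4i+3]*m[4i+3]
 * with each byte product sign-extended to 32 bits before summing.
 */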

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    /*                                                                    \
     * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
     * first iteration might not be a full 16 byte segment. But           \
     * for vector lengths beyond that this must be SVE and we know        \
     * opr_sz is a multiple of 16, so we need not clamp segend            \
     * to opr_sz_n when we advance it at the end of the loop.             \
     */                                                                   \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        segend = i + (16 / sizeof(TYPED));                                \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)

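/*
 * FCADD: elements are complex pairs, with the real part at the even
 * index.  The rot flag (bit 0 of the descriptor data) selects the
 * rotation applied to m before the addition: clear gives
 *   d.re = n.re - m.im, d.im = n.im + m.re   (rotate m by 90 degrees)
 * and set gives the 270 degree pattern with the signs swapped.
 * Bit 1 (fpcr_ah) selects FPCR.AH semantics for the negation.
 */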
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)];
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)];

        if (rot) {
            e3 = float16_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float16_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)];
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)];

        if (rot) {
            e3 = float32_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float32_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1];
        float64 e2 = n[i + 1];
        float64 e3 = m[i];

        if (rot) {
            e3 = float64_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float64_maybe_ah_chs(e1, fpcr_ah);
        }

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

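/*
 * FCMLA: flip and neg_imag together encode the four rotations.
 * flip selects whether the real or imaginary part of n feeds the
 * multiplies, while neg_real/neg_imag negate the m operands by
 * xoring into their sign bits, so the inner loop needs no branches.
 */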
void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float64)                \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)   \
    DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)
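
/*
 * Note that clt and cle need no comparison helpers of their own:
 * with the operands reversed (REV), cgt and cge applied to zero and
 * the input yield the less-than and less-or-equal results.
 */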

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
{
    float16 r = float16_sub(op1, op2, stat);
    return float16_is_any_nan(r) ? r : float16_abs(r);
}

static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
{
    float32 r = float32_sub(op1, op2, stat);
    return float32_is_any_nan(r) ? r : float32_abs(r);
}

static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
{
    float64 r = float64_sub(op1, op2, stat);
    return float64_is_any_nan(r) ? r : float64_abs(r);
}

/*
 * Reciprocal step. These are the AArch32 version which uses a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
                  float_status *stat, uint32_t desc)                       \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)

DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)

DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)

DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                 float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1553 }
1554 
1555 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1556                                  float_status *stat)
1557 {
1558     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1559 }
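
/*
 * The practical difference between the two flavours: the _nf helpers
 * round the product before the add, while the fused helpers apply a
 * single rounding to the whole expression.  A minimal float32
 * illustration, assuming round-to-nearest-even:
 *
 *  a = b = 4097.0f (0x1.001p12), c = -16777216.0f (-0x1p24)
 *  a * b = 16785409, which is not representable in float32;
 *  non-fused: float32_mul rounds it to 16785408, then + c -> 8192.0
 *  fused:     float32_muladd(a, b, c, 0, stat)            -> 8193.0
 */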
1560 
1561 #define DO_MULADD(NAME, FUNC, TYPE)                                        \
1562 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1563                   float_status *stat, uint32_t desc)                       \
1564 {                                                                          \
1565     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1566     TYPE *d = vd, *n = vn, *m = vm;                                        \
1567     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1568         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1569     }                                                                      \
1570     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1571 }
1572 
1573 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1574 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1575 
1576 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1577 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1578 
1579 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1580 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1581 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1582 
1583 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1584 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1585 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1586 
1587 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1588  * For AdvSIMD, there is of course only one such vector segment.
1589  */
1590 
1591 #define DO_MUL_IDX(NAME, TYPE, H) \
1592 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1593 {                                                                          \
1594     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1595     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1596     intptr_t idx = simd_data(desc);                                        \
1597     TYPE *d = vd, *n = vn, *m = vm;                                        \
1598     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1599         TYPE mm = m[H(i + idx)];                                           \
1600         for (j = 0; j < segment; j++) {                                    \
1601             d[i + j] = n[i + j] * mm;                                      \
1602         }                                                                  \
1603     }                                                                      \
1604     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1605 }
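
/*
 * For example, gvec_mul_idx_s with oprsz == 32 (two 128-bit segments)
 * and idx == 1 has segment == 4, so lanes 0..3 are all multiplied by
 * m[1] and lanes 4..7 by m[5]: the index selects the same relative
 * element within each segment.
 */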
1606 
1607 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1608 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1609 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1610 
1611 #undef DO_MUL_IDX
1612 
1613 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1614 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1615 {                                                                          \
1616     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1617     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1618     intptr_t idx = simd_data(desc);                                        \
1619     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1620     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1621         TYPE mm = m[H(i + idx)];                                           \
1622         for (j = 0; j < segment; j++) {                                    \
1623             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1624         }                                                                  \
1625     }                                                                      \
1626     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1627 }
1628 
1629 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1630 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1631 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1632 
1633 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1634 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1635 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1636 
1637 #undef DO_MLA_IDX
1638 
1639 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1640 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1641                   float_status *stat, uint32_t desc)                       \
1642 {                                                                          \
1643     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1644     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1645     intptr_t idx = simd_data(desc);                                        \
1646     TYPE *d = vd, *n = vn, *m = vm;                                        \
1647     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1648         TYPE mm = m[H(i + idx)];                                           \
1649         for (j = 0; j < segment; j++) {                                    \
1650             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1651         }                                                                  \
1652     }                                                                      \
1653     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1654 }
1655 
1656 #define nop(N, M, S) (M)
1657 
1658 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1659 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1660 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1661 
1662 #ifdef TARGET_AARCH64
1663 
1664 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1665 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1666 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1667 
1668 #endif
1669 
1670 #undef nop
1671 
1672 /*
1673  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1674  * the fused ops below, these accumulate both from and into Vd.
1675  */
1676 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1677 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1678 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1679 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1680 
1681 #undef DO_FMUL_IDX
1682 
1683 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1684 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1685                   float_status *stat, uint32_t desc)                       \
1686 {                                                                          \
1687     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1688     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1689     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1690     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1691     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1692     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1693     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1694         TYPE mm = m[H(i + idx)];                                           \
1695         for (j = 0; j < segment; j++) {                                    \
1696             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1697                                      mm, a[i + j], 0, stat);               \
1698         }                                                                  \
1699     }                                                                      \
1700     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1701 }
1702 
1703 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1704 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1705 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1706 
1707 #undef DO_FMLA_IDX
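
/*
 * A sketch of the simd_data layout DO_FMLA_IDX consumes above: the bit
 * at SIMD_DATA_SHIFT is the negate-op1 flag and the remaining bits are
 * the element index, so e.g. an FMLS by element 3 would encode
 * (3 << 1) | 1 in the data field.
 */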
1708 
1709 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1710 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1711 {                                                                          \
1712     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1713     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1714     bool q = false;                                                        \
1715     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1716         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1717         if (dd < MIN) {                                                    \
1718             dd = MIN;                                                      \
1719             q = true;                                                      \
1720         } else if (dd > MAX) {                                             \
1721             dd = MAX;                                                      \
1722             q = true;                                                      \
1723         }                                                                  \
1724         d[i] = dd;                                                         \
1725     }                                                                      \
1726     if (q) {                                                               \
1727         uint32_t *qc = vq;                                                 \
1728         qc[0] = 1;                                                         \
1729     }                                                                      \
1730     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1731 }
1732 
1733 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1734 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1735 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1736 
1737 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1738 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1739 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1740 
1741 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1742 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1743 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1744 
1745 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1746 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1747 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1748 
1749 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1750 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1751 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1752 
1753 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1754 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1755 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1756 
1757 #undef DO_SAT
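
/*
 * In DO_SAT above, WTYPE only needs to be wide enough to hold the
 * unclamped result: for gvec_uqadd_b, plain int holds 0..510, so e.g.
 * 200 + 100 computes as 300, is clamped to UINT8_MAX (255), and the
 * sticky QC flag is set via vq.
 */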
1758 
1759 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1760                           void *vm, uint32_t desc)
1761 {
1762     intptr_t i, oprsz = simd_oprsz(desc);
1763     uint64_t *d = vd, *n = vn, *m = vm;
1764     bool q = false;
1765 
1766     for (i = 0; i < oprsz / 8; i++) {
1767         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1768         if (dd < nn) {
1769             dd = UINT64_MAX;
1770             q = true;
1771         }
1772         d[i] = dd;
1773     }
1774     if (q) {
1775         uint32_t *qc = vq;
1776         qc[0] = 1;
1777     }
1778     clear_tail(d, oprsz, simd_maxsz(desc));
1779 }
1780 
1781 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1782                           void *vm, uint32_t desc)
1783 {
1784     intptr_t i, oprsz = simd_oprsz(desc);
1785     uint64_t *d = vd, *n = vn, *m = vm;
1786     bool q = false;
1787 
1788     for (i = 0; i < oprsz / 8; i++) {
1789         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1790         if (nn < mm) {
1791             dd = 0;
1792             q = true;
1793         }
1794         d[i] = dd;
1795     }
1796     if (q) {
1797         uint32_t *qc = vq;
1798         qc[0] = 1;
1799     }
1800     clear_tail(d, oprsz, simd_maxsz(desc));
1801 }
1802 
1803 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
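/*
 * For the 64-bit saturating ops there is no wider type to compute in,
 * so overflow is detected bitwise.  Signed addition overflows iff both
 * operands have the same sign and the sum's sign differs, which is what
 * ((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN tests below (subtraction uses
 * the complementary (dd ^ nn) & (nn ^ mm) form).  On overflow,
 * (nn >> 63) ^ ~INT64_MIN evaluates to INT64_MAX for nn >= 0 and to
 * INT64_MIN for nn < 0, i.e. saturation toward the operands' sign.
 */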
1804                           void *vm, uint32_t desc)
1805 {
1806     intptr_t i, oprsz = simd_oprsz(desc);
1807     int64_t *d = vd, *n = vn, *m = vm;
1808     bool q = false;
1809 
1810     for (i = 0; i < oprsz / 8; i++) {
1811         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1812         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1813             dd = (nn >> 63) ^ ~INT64_MIN;
1814             q = true;
1815         }
1816         d[i] = dd;
1817     }
1818     if (q) {
1819         uint32_t *qc = vq;
1820         qc[0] = 1;
1821     }
1822     clear_tail(d, oprsz, simd_maxsz(desc));
1823 }
1824 
1825 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1826                           void *vm, uint32_t desc)
1827 {
1828     intptr_t i, oprsz = simd_oprsz(desc);
1829     int64_t *d = vd, *n = vn, *m = vm;
1830     bool q = false;
1831 
1832     for (i = 0; i < oprsz / 8; i++) {
1833         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1834         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1835             dd = (nn >> 63) ^ ~INT64_MIN;
1836             q = true;
1837         }
1838         d[i] = dd;
1839     }
1840     if (q) {
1841         uint32_t *qc = vq;
1842         qc[0] = 1;
1843     }
1844     clear_tail(d, oprsz, simd_maxsz(desc));
1845 }
1846 
1847 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1848                            void *vm, uint32_t desc)
1849 {
1850     intptr_t i, oprsz = simd_oprsz(desc);
1851     uint64_t *d = vd, *n = vn, *m = vm;
1852     bool q = false;
1853 
1854     for (i = 0; i < oprsz / 8; i++) {
1855         uint64_t nn = n[i];
1856         int64_t mm = m[i];
1857         uint64_t dd = nn + mm;
1858 
1859         if (mm < 0) {
1860             if (nn < (uint64_t)-mm) {
1861                 dd = 0;
1862                 q = true;
1863             }
1864         } else {
1865             if (dd < nn) {
1866                 dd = UINT64_MAX;
1867                 q = true;
1868             }
1869         }
1870         d[i] = dd;
1871     }
1872     if (q) {
1873         uint32_t *qc = vq;
1874         qc[0] = 1;
1875     }
1876     clear_tail(d, oprsz, simd_maxsz(desc));
1877 }
1878 
1879 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1880                            void *vm, uint32_t desc)
1881 {
1882     intptr_t i, oprsz = simd_oprsz(desc);
1883     uint64_t *d = vd, *n = vn, *m = vm;
1884     bool q = false;
1885 
1886     for (i = 0; i < oprsz / 8; i++) {
1887         int64_t nn = n[i];
1888         uint64_t mm = m[i];
1889         int64_t dd = nn + mm;
1890 
1891         if (mm > (uint64_t)(INT64_MAX - nn)) {
1892             dd = INT64_MAX;
1893             q = true;
1894         }
1895         d[i] = dd;
1896     }
1897     if (q) {
1898         uint32_t *qc = vq;
1899         qc[0] = 1;
1900     }
1901     clear_tail(d, oprsz, simd_maxsz(desc));
1902 }
1903 
1904 #define DO_SRA(NAME, TYPE)                              \
1905 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1906 {                                                       \
1907     intptr_t i, oprsz = simd_oprsz(desc);               \
1908     int shift = simd_data(desc);                        \
1909     TYPE *d = vd, *n = vn;                              \
1910     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1911         d[i] += n[i] >> shift;                          \
1912     }                                                   \
1913     clear_tail(d, oprsz, simd_maxsz(desc));             \
1914 }
1915 
1916 DO_SRA(gvec_ssra_b, int8_t)
1917 DO_SRA(gvec_ssra_h, int16_t)
1918 DO_SRA(gvec_ssra_s, int32_t)
1919 DO_SRA(gvec_ssra_d, int64_t)
1920 
1921 DO_SRA(gvec_usra_b, uint8_t)
1922 DO_SRA(gvec_usra_h, uint16_t)
1923 DO_SRA(gvec_usra_s, uint32_t)
1924 DO_SRA(gvec_usra_d, uint64_t)
1925 
1926 #undef DO_SRA
1927 
1928 #define DO_RSHR(NAME, TYPE)                             \
1929 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1930 {                                                       \
1931     intptr_t i, oprsz = simd_oprsz(desc);               \
1932     int shift = simd_data(desc);                        \
1933     TYPE *d = vd, *n = vn;                              \
1934     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1935         TYPE tmp = n[i] >> (shift - 1);                 \
1936         d[i] = (tmp >> 1) + (tmp & 1);                  \
1937     }                                                   \
1938     clear_tail(d, oprsz, simd_maxsz(desc));             \
1939 }
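
/*
 * The two-step shift above implements ARM's rounding right shift,
 * (n + (1 << (shift - 1))) >> shift, without overflowing the
 * intermediate and without a single out-of-range shift when shift
 * equals the element width: tmp retains the last bit shifted out,
 * which is then added back as the rounding increment.  DO_RSRA below
 * relies on the same trick.
 */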
1940 
1941 DO_RSHR(gvec_srshr_b, int8_t)
1942 DO_RSHR(gvec_srshr_h, int16_t)
1943 DO_RSHR(gvec_srshr_s, int32_t)
1944 DO_RSHR(gvec_srshr_d, int64_t)
1945 
1946 DO_RSHR(gvec_urshr_b, uint8_t)
1947 DO_RSHR(gvec_urshr_h, uint16_t)
1948 DO_RSHR(gvec_urshr_s, uint32_t)
1949 DO_RSHR(gvec_urshr_d, uint64_t)
1950 
1951 #undef DO_RSHR
1952 
1953 #define DO_RSRA(NAME, TYPE)                             \
1954 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1955 {                                                       \
1956     intptr_t i, oprsz = simd_oprsz(desc);               \
1957     int shift = simd_data(desc);                        \
1958     TYPE *d = vd, *n = vn;                              \
1959     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1960         TYPE tmp = n[i] >> (shift - 1);                 \
1961         d[i] += (tmp >> 1) + (tmp & 1);                 \
1962     }                                                   \
1963     clear_tail(d, oprsz, simd_maxsz(desc));             \
1964 }
1965 
1966 DO_RSRA(gvec_srsra_b, int8_t)
1967 DO_RSRA(gvec_srsra_h, int16_t)
1968 DO_RSRA(gvec_srsra_s, int32_t)
1969 DO_RSRA(gvec_srsra_d, int64_t)
1970 
1971 DO_RSRA(gvec_ursra_b, uint8_t)
1972 DO_RSRA(gvec_ursra_h, uint16_t)
1973 DO_RSRA(gvec_ursra_s, uint32_t)
1974 DO_RSRA(gvec_ursra_d, uint64_t)
1975 
1976 #undef DO_RSRA
1977 
1978 #define DO_SRI(NAME, TYPE)                              \
1979 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1980 {                                                       \
1981     intptr_t i, oprsz = simd_oprsz(desc);               \
1982     int shift = simd_data(desc);                        \
1983     TYPE *d = vd, *n = vn;                              \
1984     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1985         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1986     }                                                   \
1987     clear_tail(d, oprsz, simd_maxsz(desc));             \
1988 }
1989 
1990 DO_SRI(gvec_sri_b, uint8_t)
1991 DO_SRI(gvec_sri_h, uint16_t)
1992 DO_SRI(gvec_sri_s, uint32_t)
1993 DO_SRI(gvec_sri_d, uint64_t)
1994 
1995 #undef DO_SRI
1996 
1997 #define DO_SLI(NAME, TYPE)                              \
1998 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1999 {                                                       \
2000     intptr_t i, oprsz = simd_oprsz(desc);               \
2001     int shift = simd_data(desc);                        \
2002     TYPE *d = vd, *n = vn;                              \
2003     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2004         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
2005     }                                                   \
2006     clear_tail(d, oprsz, simd_maxsz(desc));             \
2007 }
2008 
2009 DO_SLI(gvec_sli_b, uint8_t)
2010 DO_SLI(gvec_sli_h, uint16_t)
2011 DO_SLI(gvec_sli_s, uint32_t)
2012 DO_SLI(gvec_sli_d, uint64_t)
2013 
2014 #undef DO_SLI
2015 
2016 /*
2017  * Convert float16 to float32, raising no exceptions and
2018  * preserving exceptional values, including SNaN.
2019  * This is effectively an unpack+repack operation.
2020  */
2021 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
2022 {
2023     const int f16_bias = 15;
2024     const int f32_bias = 127;
2025     uint32_t sign = extract32(f16, 15, 1);
2026     uint32_t exp = extract32(f16, 10, 5);
2027     uint32_t frac = extract32(f16, 0, 10);
2028 
2029     if (exp == 0x1f) {
2030         /* Inf or NaN */
2031         exp = 0xff;
2032     } else if (exp == 0) {
2033         /* Zero or denormal.  */
2034         if (frac != 0) {
2035             if (fz16) {
2036                 frac = 0;
2037             } else {
2038                 /*
2039                  * Denormal; these are all normal float32.
2040                  * Shift the fraction so that the msb is at bit 11,
2041                  * then remove bit 11 as the implicit bit of the
2042                  * normalized float32.  Note that we still go through
2043                  * the shift for normal numbers below, to put the
2044                  * float32 fraction at the right place.
2045                  */
2046                 int shift = clz32(frac) - 21;
2047                 frac = (frac << shift) & 0x3ff;
2048                 exp = f32_bias - f16_bias - shift + 1;
2049             }
2050         }
2051     } else {
2052         /* Normal number; adjust the bias.  */
2053         exp += f32_bias - f16_bias;
2054     }
2055     sign <<= 31;
2056     exp <<= 23;
2057     frac <<= 23 - 10;
2058 
2059     return sign | exp | frac;
2060 }
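
/*
 * A worked example: f16 0x3c00 (1.0) has sign 0, exp 0x0f, frac 0;
 * the exponent rebias gives 0x0f + 127 - 15 = 0x7f, producing float32
 * 0x3f800000 (1.0).  An f16 SNaN such as 0x7d00 keeps its payload and
 * stays signaling, becoming 0x7fa00000.
 */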
2061 
2062 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2063 {
2064     /*
2065      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2066      * Load the 2nd qword iff is_q & is_2.
2067      * Shift to the 2nd dword iff !is_q & is_2.
2068      * For !is_q & !is_2, the upper bits of the result are garbage.
2069      */
2070     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2071 }
2072 
2073 /*
2074  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2075  * as there are not yet SVE versions that might use blocking.
2076  */
2077 
2078 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2079                      uint32_t desc, bool fz16)
2080 {
2081     intptr_t i, oprsz = simd_oprsz(desc);
2082     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2083     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2084     int is_q = oprsz == 16;
2085     uint64_t n_4, m_4;
2086 
2087     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2088     n_4 = load4_f16(vn, is_q, is_2);
2089     m_4 = load4_f16(vm, is_q, is_2);
2090 
2091     /* Negate all inputs for FMLSL at once.  */
2092     if (is_s) {
2093         n_4 ^= 0x8000800080008000ull;
2094     }
2095 
2096     for (i = 0; i < oprsz / 4; i++) {
2097         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2098         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2099         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2100     }
2101     clear_tail(d, oprsz, simd_maxsz(desc));
2102 }
2103 
2104 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2105                             CPUARMState *env, uint32_t desc)
2106 {
2107     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2108              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2109 }
2110 
2111 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2112                             CPUARMState *env, uint32_t desc)
2113 {
2114     do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, desc,
2115              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2116 }
2117 
2118 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2119                                CPUARMState *env, uint32_t desc)
2120 {
2121     intptr_t i, oprsz = simd_oprsz(desc);
2122     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2123     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2124     float_status *status = &env->vfp.fp_status_a64;
2125     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2126 
2127     for (i = 0; i < oprsz; i += sizeof(float32)) {
2128         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2129         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2130         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2131         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2132         float32 aa = *(float32 *)(va + H1_4(i));
2133 
2134         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2135     }
2136 }
2137 
2138 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2139                          uint32_t desc, bool fz16)
2140 {
2141     intptr_t i, oprsz = simd_oprsz(desc);
2142     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2143     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2144     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2145     int is_q = oprsz == 16;
2146     uint64_t n_4;
2147     float32 m_1;
2148 
2149     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2150     n_4 = load4_f16(vn, is_q, is_2);
2151 
2152     /* Negate all inputs for FMLSL at once.  */
2153     if (is_s) {
2154         n_4 ^= 0x8000800080008000ull;
2155     }
2156 
2157     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2158 
2159     for (i = 0; i < oprsz / 4; i++) {
2160         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2161         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2162     }
2163     clear_tail(d, oprsz, simd_maxsz(desc));
2164 }
2165 
2166 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2167                                 CPUARMState *env, uint32_t desc)
2168 {
2169     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2170                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2171 }
2172 
2173 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2174                                 CPUARMState *env, uint32_t desc)
2175 {
2176     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, desc,
2177                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2178 }
2179 
2180 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2181                                CPUARMState *env, uint32_t desc)
2182 {
2183     intptr_t i, j, oprsz = simd_oprsz(desc);
2184     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2185     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2186     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2187     float_status *status = &env->vfp.fp_status_a64;
2188     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2189 
2190     for (i = 0; i < oprsz; i += 16) {
2191         float16 mm_16 = *(float16 *)(vm + i + idx);
2192         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2193 
2194         for (j = 0; j < 16; j += sizeof(float32)) {
2195             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2196             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2197             float32 aa = *(float32 *)(va + H1_4(i + j));
2198 
2199             *(float32 *)(vd + H1_4(i + j)) =
2200                 float32_muladd(nn, mm, aa, 0, status);
2201         }
2202     }
2203 }
2204 
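/*
 * For the SSHL/USHL helpers below, the shift count is the low signed
 * byte of each element of Vm; a negative count shifts right instead.
 * Counts covering the whole element width clamp: e.g. for bytes,
 * nn << mm with mm >= 8 gives 0, and a signed nn >> -mm with mm <= -8
 * becomes nn >> 7 (all sign bits), while the unsigned variant gives 0.
 */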
2205 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2206 {
2207     intptr_t i, opr_sz = simd_oprsz(desc);
2208     int8_t *d = vd, *n = vn, *m = vm;
2209 
2210     for (i = 0; i < opr_sz; ++i) {
2211         int8_t mm = m[i];
2212         int8_t nn = n[i];
2213         int8_t res = 0;
2214         if (mm >= 0) {
2215             if (mm < 8) {
2216                 res = nn << mm;
2217             }
2218         } else {
2219             res = nn >> (mm > -8 ? -mm : 7);
2220         }
2221         d[i] = res;
2222     }
2223     clear_tail(d, opr_sz, simd_maxsz(desc));
2224 }
2225 
2226 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2227 {
2228     intptr_t i, opr_sz = simd_oprsz(desc);
2229     int16_t *d = vd, *n = vn, *m = vm;
2230 
2231     for (i = 0; i < opr_sz / 2; ++i) {
2232         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2233         int16_t nn = n[i];
2234         int16_t res = 0;
2235         if (mm >= 0) {
2236             if (mm < 16) {
2237                 res = nn << mm;
2238             }
2239         } else {
2240             res = nn >> (mm > -16 ? -mm : 15);
2241         }
2242         d[i] = res;
2243     }
2244     clear_tail(d, opr_sz, simd_maxsz(desc));
2245 }
2246 
2247 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2248 {
2249     intptr_t i, opr_sz = simd_oprsz(desc);
2250     uint8_t *d = vd, *n = vn, *m = vm;
2251 
2252     for (i = 0; i < opr_sz; ++i) {
2253         int8_t mm = m[i];
2254         uint8_t nn = n[i];
2255         uint8_t res = 0;
2256         if (mm >= 0) {
2257             if (mm < 8) {
2258                 res = nn << mm;
2259             }
2260         } else {
2261             if (mm > -8) {
2262                 res = nn >> -mm;
2263             }
2264         }
2265         d[i] = res;
2266     }
2267     clear_tail(d, opr_sz, simd_maxsz(desc));
2268 }
2269 
2270 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2271 {
2272     intptr_t i, opr_sz = simd_oprsz(desc);
2273     uint16_t *d = vd, *n = vn, *m = vm;
2274 
2275     for (i = 0; i < opr_sz / 2; ++i) {
2276         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2277         uint16_t nn = n[i];
2278         uint16_t res = 0;
2279         if (mm >= 0) {
2280             if (mm < 16) {
2281                 res = nn << mm;
2282             }
2283         } else {
2284             if (mm > -16) {
2285                 res = nn >> -mm;
2286             }
2287         }
2288         d[i] = res;
2289     }
2290     clear_tail(d, opr_sz, simd_maxsz(desc));
2291 }
2292 
2293 /*
2294  * 8x8->8 polynomial multiply.
2295  *
2296  * Polynomial multiplication is like integer multiplication except the
2297  * partial products are XORed, not added.
2298  *
2299  * TODO: expose this as a generic vector operation, as it is a common
2300  * crypto building block.
2301  */
2302 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2303 {
2304     intptr_t i, opr_sz = simd_oprsz(desc);
2305     uint64_t *d = vd, *n = vn, *m = vm;
2306 
2307     for (i = 0; i < opr_sz / 8; ++i) {
2308         d[i] = clmul_8x8_low(n[i], m[i]);
2309     }
2310     clear_tail(d, opr_sz, simd_maxsz(desc));
2311 }
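
/*
 * For example, 0x03 * 0x03 is 0x09 as an integer multiply but 0x05 as
 * a polynomial multiply: the partial products 0b0011 and 0b0110 are
 * XORed rather than added, so the overlapping bit cancels instead of
 * carrying.
 */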
2312 
2313 /*
2314  * 64x64->128 polynomial multiply.
2315  * Because the lanes are not accessed in strict columns,
2316  * this probably cannot be turned into a generic helper.
2317  */
2318 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2319 {
2320     intptr_t i, opr_sz = simd_oprsz(desc);
2321     intptr_t hi = simd_data(desc);
2322     uint64_t *d = vd, *n = vn, *m = vm;
2323 
2324     for (i = 0; i < opr_sz / 8; i += 2) {
2325         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2326         d[i] = int128_getlo(r);
2327         d[i + 1] = int128_gethi(r);
2328     }
2329     clear_tail(d, opr_sz, simd_maxsz(desc));
2330 }
2331 
2332 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2333 {
2334     int hi = simd_data(desc);
2335     uint64_t *d = vd, *n = vn, *m = vm;
2336     uint64_t nn = n[hi], mm = m[hi];
2337 
2338     d[0] = clmul_8x4_packed(nn, mm);
2339     nn >>= 32;
2340     mm >>= 32;
2341     d[1] = clmul_8x4_packed(nn, mm);
2342 
2343     clear_tail(d, 16, simd_maxsz(desc));
2344 }
2345 
2346 #ifdef TARGET_AARCH64
2347 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2348 {
2349     int shift = simd_data(desc) * 8;
2350     intptr_t i, opr_sz = simd_oprsz(desc);
2351     uint64_t *d = vd, *n = vn, *m = vm;
2352 
2353     for (i = 0; i < opr_sz / 8; ++i) {
2354         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2355     }
2356 }
2357 
2358 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2359 {
2360     intptr_t sel = H4(simd_data(desc));
2361     intptr_t i, opr_sz = simd_oprsz(desc);
2362     uint32_t *n = vn, *m = vm;
2363     uint64_t *d = vd;
2364 
2365     for (i = 0; i < opr_sz / 8; ++i) {
2366         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2367     }
2368 }
2369 #endif
2370 
2371 #define DO_CMP0(NAME, TYPE, OP)                         \
2372 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2373 {                                                       \
2374     intptr_t i, opr_sz = simd_oprsz(desc);              \
2375     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2376         TYPE nn = *(TYPE *)(vn + i);                    \
2377         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2378     }                                                   \
2379     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2380 }
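
/*
 * The comparison yields 0 or 1, and negating that produces the
 * all-zeroes or all-ones element mask that the Neon compare-with-zero
 * instructions define; e.g. for gvec_clt0_b with nn == -5,
 * -(nn < 0) == -1 == 0xff.
 */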
2381 
2382 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2383 DO_CMP0(gvec_clt0_b, int8_t, <)
2384 DO_CMP0(gvec_cle0_b, int8_t, <=)
2385 DO_CMP0(gvec_cgt0_b, int8_t, >)
2386 DO_CMP0(gvec_cge0_b, int8_t, >=)
2387 
2388 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2389 DO_CMP0(gvec_clt0_h, int16_t, <)
2390 DO_CMP0(gvec_cle0_h, int16_t, <=)
2391 DO_CMP0(gvec_cgt0_h, int16_t, >)
2392 DO_CMP0(gvec_cge0_h, int16_t, >=)
2393 
2394 #undef DO_CMP0
2395 
2396 #define DO_ABD(NAME, TYPE)                                      \
2397 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2398 {                                                               \
2399     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2400     TYPE *d = vd, *n = vn, *m = vm;                             \
2401                                                                 \
2402     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2403         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2404     }                                                           \
2405     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2406 }
2407 
2408 DO_ABD(gvec_sabd_b, int8_t)
2409 DO_ABD(gvec_sabd_h, int16_t)
2410 DO_ABD(gvec_sabd_s, int32_t)
2411 DO_ABD(gvec_sabd_d, int64_t)
2412 
2413 DO_ABD(gvec_uabd_b, uint8_t)
2414 DO_ABD(gvec_uabd_h, uint16_t)
2415 DO_ABD(gvec_uabd_s, uint32_t)
2416 DO_ABD(gvec_uabd_d, uint64_t)
2417 
2418 #undef DO_ABD
2419 
2420 #define DO_ABA(NAME, TYPE)                                      \
2421 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2422 {                                                               \
2423     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2424     TYPE *d = vd, *n = vn, *m = vm;                             \
2425                                                                 \
2426     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2427         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2428     }                                                           \
2429     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2430 }
2431 
2432 DO_ABA(gvec_saba_b, int8_t)
2433 DO_ABA(gvec_saba_h, int16_t)
2434 DO_ABA(gvec_saba_s, int32_t)
2435 DO_ABA(gvec_saba_d, int64_t)
2436 
2437 DO_ABA(gvec_uaba_b, uint8_t)
2438 DO_ABA(gvec_uaba_h, uint16_t)
2439 DO_ABA(gvec_uaba_s, uint32_t)
2440 DO_ABA(gvec_uaba_d, uint64_t)
2441 
2442 #undef DO_ABA
2443 
2444 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2445 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
2446                   float_status *stat, uint32_t desc)                       \
2447 {                                                                          \
2448     ARMVectorReg scratch;                                                  \
2449     intptr_t oprsz = simd_oprsz(desc);                                     \
2450     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2451     TYPE *d = vd, *n = vn, *m = vm;                                        \
2452     if (unlikely(d == m)) {                                                \
2453         m = memcpy(&scratch, m, oprsz);                                    \
2454     }                                                                      \
2455     for (intptr_t i = 0; i < half; ++i) {                                  \
2456         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2457     }                                                                      \
2458     for (intptr_t i = 0; i < half; ++i) {                                  \
2459         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2460     }                                                                      \
2461     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2462 }
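
/*
 * E.g. for gvec_faddp_s with oprsz == 16: d[0] = n[0] + n[1],
 * d[1] = n[2] + n[3], d[2] = m[0] + m[1], d[3] = m[2] + m[3].
 * The scratch copy guards d == m, since the second loop reads m after
 * the first loop has already overwritten d; d == n is safe because
 * each n pair is read before the corresponding d element is written.
 */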
2463 
2464 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2465 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2466 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2467 
2468 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2469 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2470 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2471 
2472 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2473 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2474 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2475 
2476 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2477 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2478 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2479 
2480 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2481 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2482 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2483 
2484 #ifdef TARGET_AARCH64
2485 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
2486 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
2487 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, )
2488 
2489 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
2490 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
2491 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, )
2492 #endif
2493 
2494 #undef DO_3OP_PAIR
2495 
2496 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2497 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2498 {                                                               \
2499     ARMVectorReg scratch;                                       \
2500     intptr_t oprsz = simd_oprsz(desc);                          \
2501     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2502     TYPE *d = vd, *n = vn, *m = vm;                             \
2503     if (unlikely(d == m)) {                                     \
2504         m = memcpy(&scratch, m, oprsz);                         \
2505     }                                                           \
2506     for (intptr_t i = 0; i < half; ++i) {                       \
2507         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2508     }                                                           \
2509     for (intptr_t i = 0; i < half; ++i) {                       \
2510         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2511     }                                                           \
2512     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2513 }
2514 
2515 #define ADD(A, B) (A + B)
2516 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2517 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2518 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2519 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2520 #undef  ADD
2521 
2522 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2523 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2524 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2525 
2526 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2527 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2528 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2529 
2530 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2531 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2532 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2533 
2534 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2535 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2536 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2537 
2538 #undef DO_3OP_PAIR
2539 
2540 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2541     void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
2542     {                                                                   \
2543         intptr_t i, oprsz = simd_oprsz(desc);                           \
2544         int shift = simd_data(desc);                                    \
2545         TYPE *d = vd, *n = vn;                                          \
2546         float_status *fpst = stat;                                      \
2547         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2548             d[i] = FUNC(n[i], shift, fpst);                             \
2549         }                                                               \
2550         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2551     }
2552 
2553 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2554 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2555 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2556 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2557 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2558 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2559 
2560 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2561 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2562 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2563 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2564 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2565 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2566 
2567 #undef DO_VCVT_FIXED
2568 
2569 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2570     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2571     {                                                                   \
2572         intptr_t i, oprsz = simd_oprsz(desc);                           \
2573         uint32_t rmode = simd_data(desc);                               \
2574         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2575         TYPE *d = vd, *n = vn;                                          \
2576         set_float_rounding_mode(rmode, fpst);                           \
2577         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2578             d[i] = FUNC(n[i], 0, fpst);                                 \
2579         }                                                               \
2580         set_float_rounding_mode(prev_rmode, fpst);                      \
2581         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2582     }
2583 
2584 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
2585 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
2586 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2587 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2588 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2589 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2590 
2591 #undef DO_VCVT_RMODE
2592 
2593 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2594     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2595     {                                                                   \
2596         intptr_t i, oprsz = simd_oprsz(desc);                           \
2597         uint32_t rmode = simd_data(desc);                               \
2598         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2599         TYPE *d = vd, *n = vn;                                          \
2600         set_float_rounding_mode(rmode, fpst);                           \
2601         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2602             d[i] = FUNC(n[i], fpst);                                    \
2603         }                                                               \
2604         set_float_rounding_mode(prev_rmode, fpst);                      \
2605         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2606     }
2607 
2608 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2609 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2610 
2611 #undef DO_VRINT_RMODE
2612 
2613 #ifdef TARGET_AARCH64
2614 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
2615 {
2616     const uint8_t *indices = vm;
2617     size_t oprsz = simd_oprsz(desc);
2618     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2619     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2620     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2621     union {
2622         uint8_t b[16];
2623         uint64_t d[2];
2624     } result;
2625 
2626     /*
2627      * We must construct the final result in a temp, lest the output
2628      * overlaps the input table.  For TBL, begin with zero; for TBX,
2629      * begin with the original register contents.  Note that we always
2630      * copy 16 bytes here to avoid an extra branch; clearing the high
2631      * bits of the register for oprsz == 8 is handled below.
2632      */
2633     if (is_tbx) {
2634         memcpy(&result, vd, 16);
2635     } else {
2636         memset(&result, 0, 16);
2637     }
2638 
2639     for (size_t i = 0; i < oprsz; ++i) {
2640         uint32_t index = indices[H1(i)];
2641 
2642         if (index < table_len) {
2643             /*
2644              * Convert index (a byte offset into the virtual table
2645              * which is a series of 128-bit vectors concatenated)
2646              * into the correct register element, bearing in mind
2647              * that the table can wrap around from V31 to V0.
2648              */
2649             const uint8_t *table = (const uint8_t *)
2650                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2651             result.b[H1(i)] = table[H1(index % 16)];
2652         }
2653     }
2654 
2655     memcpy(vd, &result, 16);
2656     clear_tail(vd, oprsz, simd_maxsz(desc));
2657 }
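
/*
 * E.g. a TBL with two table registers (table_len == 32 bytes) and an
 * index byte of 0x21 is out of range and writes 0 to that result byte;
 * the same lookup as TBX leaves the destination byte unchanged.
 */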
2658 #endif
2659 
2660 /*
2661  * NxN -> N highpart multiply
2662  *
2663  * TODO: expose this as a generic vector operation.
2664  */
2665 
2666 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2667 {
2668     intptr_t i, opr_sz = simd_oprsz(desc);
2669     int8_t *d = vd, *n = vn, *m = vm;
2670 
2671     for (i = 0; i < opr_sz; ++i) {
2672         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2673     }
2674     clear_tail(d, opr_sz, simd_maxsz(desc));
2675 }
2676 
2677 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2678 {
2679     intptr_t i, opr_sz = simd_oprsz(desc);
2680     int16_t *d = vd, *n = vn, *m = vm;
2681 
2682     for (i = 0; i < opr_sz / 2; ++i) {
2683         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2684     }
2685     clear_tail(d, opr_sz, simd_maxsz(desc));
2686 }
2687 
2688 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2689 {
2690     intptr_t i, opr_sz = simd_oprsz(desc);
2691     int32_t *d = vd, *n = vn, *m = vm;
2692 
2693     for (i = 0; i < opr_sz / 4; ++i) {
2694         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2695     }
2696     clear_tail(d, opr_sz, simd_maxsz(desc));
2697 }
2698 
2699 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2700 {
2701     intptr_t i, opr_sz = simd_oprsz(desc);
2702     uint64_t *d = vd, *n = vn, *m = vm;
2703     uint64_t discard;
2704 
2705     for (i = 0; i < opr_sz / 8; ++i) {
2706         muls64(&discard, &d[i], n[i], m[i]);
2707     }
2708     clear_tail(d, opr_sz, simd_maxsz(desc));
2709 }
2710 
2711 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2712 {
2713     intptr_t i, opr_sz = simd_oprsz(desc);
2714     uint8_t *d = vd, *n = vn, *m = vm;
2715 
2716     for (i = 0; i < opr_sz; ++i) {
2717         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2718     }
2719     clear_tail(d, opr_sz, simd_maxsz(desc));
2720 }
2721 
2722 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2723 {
2724     intptr_t i, opr_sz = simd_oprsz(desc);
2725     uint16_t *d = vd, *n = vn, *m = vm;
2726 
2727     for (i = 0; i < opr_sz / 2; ++i) {
2728         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2729     }
2730     clear_tail(d, opr_sz, simd_maxsz(desc));
2731 }
2732 
2733 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2734 {
2735     intptr_t i, opr_sz = simd_oprsz(desc);
2736     uint32_t *d = vd, *n = vn, *m = vm;
2737 
2738     for (i = 0; i < opr_sz / 4; ++i) {
2739         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2740     }
2741     clear_tail(d, opr_sz, simd_maxsz(desc));
2742 }
2743 
2744 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2745 {
2746     intptr_t i, opr_sz = simd_oprsz(desc);
2747     uint64_t *d = vd, *n = vn, *m = vm;
2748     uint64_t discard;
2749 
2750     for (i = 0; i < opr_sz / 8; ++i) {
2751         mulu64(&discard, &d[i], n[i], m[i]);
2752     }
2753     clear_tail(d, opr_sz, simd_maxsz(desc));
2754 }
2755 
2756 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2757 {
2758     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2759     int shr = simd_data(desc);
2760     uint64_t *d = vd, *n = vn, *m = vm;
2761 
2762     for (i = 0; i < opr_sz; ++i) {
2763         d[i] = ror64(n[i] ^ m[i], shr);
2764     }
2765     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2766 }
2767 
2768 /*
2769  * Integer matrix-multiply accumulate
2770  */
2771 
2772 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2773 {
2774     int8_t *n = vn, *m = vm;
2775 
2776     for (intptr_t k = 0; k < 8; ++k) {
2777         sum += n[H1(k)] * m[H1(k)];
2778     }
2779     return sum;
2780 }
2781 
2782 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2783 {
2784     uint8_t *n = vn, *m = vm;
2785 
2786     for (intptr_t k = 0; k < 8; ++k) {
2787         sum += n[H1(k)] * m[H1(k)];
2788     }
2789     return sum;
2790 }
2791 
2792 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2793 {
2794     uint8_t *n = vn;
2795     int8_t *m = vm;
2796 
2797     for (intptr_t k = 0; k < 8; ++k) {
2798         sum += n[H1(k)] * m[H1(k)];
2799     }
2800     return sum;
2801 }
2802 
2803 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2804                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2805 {
2806     intptr_t seg, opr_sz = simd_oprsz(desc);
2807 
2808     for (seg = 0; seg < opr_sz; seg += 16) {
2809         uint32_t *d = vd + seg;
2810         uint32_t *a = va + seg;
2811         uint32_t sum0, sum1, sum2, sum3;
2812 
2813         /*
2814          * Process the entire segment at once, writing back the
2815          * results only after we've consumed all of the inputs.
2816          *
2817          * Key to indices by column:
2818          *          i   j                  i             j
2819          */
2820         sum0 = a[H4(0 + 0)];
2821         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2822         sum1 = a[H4(0 + 1)];
2823         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2824         sum2 = a[H4(2 + 0)];
2825         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2826         sum3 = a[H4(2 + 1)];
2827         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2828 
2829         d[H4(0)] = sum0;
2830         d[H4(1)] = sum1;
2831         d[H4(2)] = sum2;
2832         d[H4(3)] = sum3;
2833     }
2834     clear_tail(vd, opr_sz, simd_maxsz(desc));
2835 }
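
/*
 * In other words, each 16-byte segment is treated as a 2x2 matrix of
 * int32 results: element [i][j] accumulates the dot product of row i
 * of the 2x8 matrix in Vn with row j of the 2x8 matrix in Vm, hence
 * the four sum/inner_loop pairs above with byte offsets 0 and 8.
 */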
2836 
2837 #define DO_MMLA_B(NAME, INNER) \
2838     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2839     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2840 
2841 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2842 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2843 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2844 
2845 /*
2846  * BFloat16 Dot Product
2847  */
2848 
2849 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2850 {
2851     /*
2852      * For BFDOT, BFMMLA, etc., the behaviour depends on FPCR.EBF.
2853      * For EBF = 0, we ignore the FPCR bits which determine rounding
2854      * mode and denormal-flushing, and we do unfused multiplies and
2855      * additions with intermediate rounding of all products and sums.
2856      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2857      * and we perform a fused two-way sum-of-products without intermediate
2858      * rounding of the products.
2859      * In either case, we don't set fp exception flags.
2860      *
2861      * EBF is AArch64 only, so even if it's set in the FPCR it has
2862      * no effect on AArch32 instructions.
2863      */
2864     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2865 
2866     *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32;
2867     set_default_nan_mode(true, statusp);
2868 
2869     if (ebf) {
2870         /* EBF=1 needs to do a step with round-to-odd semantics */
2871         *oddstatusp = *statusp;
2872         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2873     } else {
2874         set_flush_to_zero(true, statusp);
2875         set_flush_inputs_to_zero(true, statusp);
2876         set_float_rounding_mode(float_round_to_odd_inf, statusp);
2877     }
2878     return ebf;
2879 }
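
/*
 * Typical usage (a sketch, mirroring the helpers below): set up
 * both statuses once per helper invocation, then pick the EBF or
 * non-EBF dot-product routine for the whole vector:
 *
 *   float_status fpst, fpst_odd;
 *   if (is_ebf(env, &fpst, &fpst_odd)) {
 *       sum = bfdotadd_ebf(sum, e1, e2, &fpst, &fpst_odd);
 *   } else {
 *       sum = bfdotadd(sum, e1, e2, &fpst);
 *   }
 */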
2880 
2881 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2882 {
2883     float32 t1, t2;
2884 
2885     /*
2886      * Extract each BFloat16 from the element pair: shift the even
2887      * element up into the float32 high half; mask the odd in place.
2888      */
2889     t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2890     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2891     t1 = float32_add(t1, t2, fpst);
2892     t1 = float32_add(sum, t1, fpst);
2893 
2894     return t1;
2895 }
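
/*
 * This works because bfloat16 is exactly the high 16 bits of a
 * float32, so widening is a 16-bit left shift; e.g. the bfloat16
 * 0x3f80 (1.0) widens to the float32 0x3f800000 (1.0).
 */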
2896 
2897 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
2898                      float_status *fpst, float_status *fpst_odd)
2899 {
2900     /*
2901      * Compare f16_dotadd() in sme_helper.c, but here we have
2902      * bfloat16 inputs. In particular that means that we do not
2903      * want the FPCR.FZ16 flush semantics, so we use the normal
2904      * float_status for the input handling here.
2905      */
2906     float64 e1r = float32_to_float64(e1 << 16, fpst);
2907     float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
2908     float64 e2r = float32_to_float64(e2 << 16, fpst);
2909     float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
2910     float64 t64;
2911     float32 t32;
2912 
2913     /*
2914      * The ARM pseudocode function FPDot performs both multiplies
2915      * and the add with a single rounding operation.  Emulate this
2916      * by performing the first multiply in round-to-odd, then doing
2917      * the second multiply as fused multiply-add, and rounding to
2918      * float32 all in one step.
2919      */
2920     t64 = float64_mul(e1r, e2r, fpst_odd);
2921     t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
2922 
2923     /* This conversion is exact, because we've already rounded. */
2924     t32 = float64_to_float32(t64, fpst);
2925 
2926     /* The final accumulation step is not fused. */
2927     return float32_add(sum, t32, fpst);
2928 }
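
/*
 * Round-to-odd sets the low result bit whenever any discarded bits
 * are nonzero, so that sticky information survives the final
 * rounding to float32; with float64 carrying more than twice
 * float32's precision for the intermediate, this avoids
 * double-rounding and matches the single-rounding FPDot result.
 */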
2929 
2930 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
2931                         CPUARMState *env, uint32_t desc)
2932 {
2933     intptr_t i, opr_sz = simd_oprsz(desc);
2934     float32 *d = vd, *a = va;
2935     uint32_t *n = vn, *m = vm;
2936     float_status fpst, fpst_odd;
2937 
2938     if (is_ebf(env, &fpst, &fpst_odd)) {
2939         for (i = 0; i < opr_sz / 4; ++i) {
2940             d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
2941         }
2942     } else {
2943         for (i = 0; i < opr_sz / 4; ++i) {
2944             d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
2945         }
2946     }
2947     clear_tail(d, opr_sz, simd_maxsz(desc));
2948 }
2949 
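/*
 * Indexed BFDOT: the indexed 32-bit element pair is selected from
 * within each 128-bit segment of m, so m_idx is reloaded once per
 * segment in the loops below.
 */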
2950 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2951                             void *va, CPUARMState *env, uint32_t desc)
2952 {
2953     intptr_t i, j, opr_sz = simd_oprsz(desc);
2954     intptr_t index = simd_data(desc);
2955     intptr_t elements = opr_sz / 4;
2956     intptr_t eltspersegment = MIN(16 / 4, elements);
2957     float32 *d = vd, *a = va;
2958     uint32_t *n = vn, *m = vm;
2959     float_status fpst, fpst_odd;
2960 
2961     if (is_ebf(env, &fpst, &fpst_odd)) {
2962         for (i = 0; i < elements; i += eltspersegment) {
2963             uint32_t m_idx = m[i + H4(index)];
2964 
2965             for (j = i; j < i + eltspersegment; j++) {
2966                 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
2967             }
2968         }
2969     } else {
2970         for (i = 0; i < elements; i += eltspersegment) {
2971             uint32_t m_idx = m[i + H4(index)];
2972 
2973             for (j = i; j < i + eltspersegment; j++) {
2974                 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
2975             }
2976         }
2977     }
2978     clear_tail(d, opr_sz, simd_maxsz(desc));
2979 }
2980 
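/*
 * BFMMLA has the same 2x2 segment structure as do_mmla_b() above,
 * but each of the four accumulators sums two bfloat16 pair
 * dot-products (four products per result element).
 */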
2981 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
2982                          CPUARMState *env, uint32_t desc)
2983 {
2984     intptr_t s, opr_sz = simd_oprsz(desc);
2985     float32 *d = vd, *a = va;
2986     uint32_t *n = vn, *m = vm;
2987     float_status fpst, fpst_odd;
2988 
2989     if (is_ebf(env, &fpst, &fpst_odd)) {
2990         for (s = 0; s < opr_sz / 4; s += 4) {
2991             float32 sum00, sum01, sum10, sum11;
2992 
2993             /*
2994              * Process the entire segment at once, writing back the
2995              * results only after we've consumed all of the inputs.
2996              *
2997              * Key to indices by column:
2998              *               i   j               i   k             j   k
2999              */
3000             sum00 = a[s + H4(0 + 0)];
3001             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3002             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3003 
3004             sum01 = a[s + H4(0 + 1)];
3005             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3006             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3007 
3008             sum10 = a[s + H4(2 + 0)];
3009             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3010             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3011 
3012             sum11 = a[s + H4(2 + 1)];
3013             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3014             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3015 
3016             d[s + H4(0 + 0)] = sum00;
3017             d[s + H4(0 + 1)] = sum01;
3018             d[s + H4(2 + 0)] = sum10;
3019             d[s + H4(2 + 1)] = sum11;
3020         }
3021     } else {
3022         for (s = 0; s < opr_sz / 4; s += 4) {
3023             float32 sum00, sum01, sum10, sum11;
3024 
3025             /*
3026              * Process the entire segment at once, writing back the
3027              * results only after we've consumed all of the inputs.
3028              *
3029              * Key to indices by column:
3030              *               i   j           i   k             j   k
3031              */
3032             sum00 = a[s + H4(0 + 0)];
3033             sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
3034             sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
3035 
3036             sum01 = a[s + H4(0 + 1)];
3037             sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
3038             sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
3039 
3040             sum10 = a[s + H4(2 + 0)];
3041             sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
3042             sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
3043 
3044             sum11 = a[s + H4(2 + 1)];
3045             sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
3046             sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
3047 
3048             d[s + H4(0 + 0)] = sum00;
3049             d[s + H4(0 + 1)] = sum01;
3050             d[s + H4(2 + 0)] = sum10;
3051             d[s + H4(2 + 1)] = sum11;
3052         }
3053     }
3054     clear_tail(d, opr_sz, simd_maxsz(desc));
3055 }
3056 
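/*
 * BFMLAL{B,T}: sel (0 or 1) picks the even or odd bfloat16 of each
 * 32-bit pair; the selected elements are widened to float32 and
 * fused multiply-accumulated into the float32 destination.
 */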
3057 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
3058                          float_status *stat, uint32_t desc)
3059 {
3060     intptr_t i, opr_sz = simd_oprsz(desc);
3061     intptr_t sel = simd_data(desc);
3062     float32 *d = vd, *a = va;
3063     bfloat16 *n = vn, *m = vm;
3064 
3065     for (i = 0; i < opr_sz / 4; ++i) {
3066         float32 nn = n[H2(i * 2 + sel)] << 16;
3067         float32 mm = m[H2(i * 2 + sel)] << 16;
3068         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
3069     }
3070     clear_tail(d, opr_sz, simd_maxsz(desc));
3071 }
3072 
3073 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
3074                              void *va, float_status *stat, uint32_t desc)
3075 {
3076     intptr_t i, j, opr_sz = simd_oprsz(desc);
3077     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3078     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3079     intptr_t elements = opr_sz / 4;
3080     intptr_t eltspersegment = MIN(16 / 4, elements);
3081     float32 *d = vd, *a = va;
3082     bfloat16 *n = vn, *m = vm;
3083 
3084     for (i = 0; i < elements; i += eltspersegment) {
3085         float32 m_idx = m[H2(2 * i + index)] << 16;
3086 
3087         for (j = i; j < i + eltspersegment; j++) {
3088             float32 n_j = n[H2(2 * j + sel)] << 16;
3089             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
3090         }
3091     }
3092     clear_tail(d, opr_sz, simd_maxsz(desc));
3093 }
3094 
3095 #define DO_CLAMP(NAME, TYPE) \
3096 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
3097 {                                                                       \
3098     intptr_t i, opr_sz = simd_oprsz(desc);                              \
3099     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
3100         TYPE aa = *(TYPE *)(a + i);                                     \
3101         TYPE nn = *(TYPE *)(n + i);                                     \
3102         TYPE mm = *(TYPE *)(m + i);                                     \
3103         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
3104         *(TYPE *)(d + i) = dd;                                          \
3105     }                                                                   \
3106     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
3107 }
3108 
3109 DO_CLAMP(gvec_sclamp_b, int8_t)
3110 DO_CLAMP(gvec_sclamp_h, int16_t)
3111 DO_CLAMP(gvec_sclamp_s, int32_t)
3112 DO_CLAMP(gvec_sclamp_d, int64_t)
3113 
3114 DO_CLAMP(gvec_uclamp_b, uint8_t)
3115 DO_CLAMP(gvec_uclamp_h, uint16_t)
3116 DO_CLAMP(gvec_uclamp_s, uint32_t)
3117 DO_CLAMP(gvec_uclamp_d, uint64_t)
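
/*
 * A worked example of the clamp above (a sketch): the value a is
 * clamped into [n, m], so with int8_t a = -100, n = -10, m = 10
 * the result is MIN(MAX(-100, -10), 10) = -10.
 */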
3118 
3119 /* Bit count in each 8-bit word. */
3120 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
3121 {
3122     intptr_t i, opr_sz = simd_oprsz(desc);
3123     uint8_t *d = vd, *n = vn;
3124 
3125     for (i = 0; i < opr_sz; ++i) {
3126         d[i] = ctpop8(n[i]);
3127     }
3128     clear_tail(d, opr_sz, simd_maxsz(desc));
3129 }
3130 
3131 /* Reverse bits in each 8-bit word. */
3132 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
3133 {
3134     intptr_t i, opr_sz = simd_oprsz(desc);
3135     uint64_t *d = vd, *n = vn;
3136 
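    /*
     * bswap64 first reverses the byte order; revbit64 then reverses
     * all 64 bits, which restores the byte order while reversing
     * the bits within each byte.
     */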
3137     for (i = 0; i < opr_sz / 8; ++i) {
3138         d[i] = revbit64(bswap64(n[i]));
3139     }
3140     clear_tail(d, opr_sz, simd_maxsz(desc));
3141 }
3142 
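/*
 * URECPE and URSQRTE produce per-element reciprocal and
 * reciprocal-square-root estimates on 32-bit fixed-point inputs;
 * the per-element work is delegated to helper_recpe_u32() and
 * helper_rsqrte_u32().
 */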
3143 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
3144 {
3145     intptr_t i, opr_sz = simd_oprsz(desc);
3146     uint32_t *d = vd, *n = vn;
3147 
3148     for (i = 0; i < opr_sz / 4; ++i) {
3149         d[i] = helper_recpe_u32(n[i]);
3150     }
3151     clear_tail(d, opr_sz, simd_maxsz(desc));
3152 }
3153 
3154 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
3155 {
3156     intptr_t i, opr_sz = simd_oprsz(desc);
3157     uint32_t *d = vd, *n = vn;
3158 
3159     for (i = 0; i < opr_sz / 4; ++i) {
3160         d[i] = helper_rsqrte_u32(n[i]);
3161     }
3162     clear_tail(d, opr_sz, simd_maxsz(desc));
3163 }
3164