xref: /qemu/target/arm/tcg/vec_helper.c (revision 5f9976486970b0fec50ff4c07da7af620cd7d0a0)
/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
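/*
 * For example, i = 0x05 (predicate bits 0 and 2 set) expands to
 * 0x0000000000ff00ff, i.e. all-ones in bytes 0 and 2.
 */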
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
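/*
 * Predicate bits for 2-byte elements are active only at even bit
 * positions, so indices with any odd bit set (i & 0xaa) never occur
 * and the table only needs entries up to [0x55].
 */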
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
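/*
 * Worked example, round = true: src1 = src2 = 0x40 (0.5 as Q7),
 * src3 = 0: (0x40 * 0x40 + (1 << 6)) >> 7 = 32 = 0x20 (0.25).
 * With src1 = src2 = INT8_MIN the shifted result is 128, which does
 * not fit in int8_t, so it saturates to INT8_MAX.
 */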

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}
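/*
 * Unlike do_sqrdmlah_b, saturation is also reported through *sat.
 * The Neon helpers below pass a pointer into env->vfp.qc so that
 * saturation sets the cumulative FPSCR.QC flag; the SVE2 helpers
 * pass a local "discard" temporary, leaving QC untouched.
 */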

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

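    /* The value fits in 64 bits iff hs is the sign-extension of ls. */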
704         return hs < 0 ? INT64_MIN : INT64_MAX;
705     }
706     return ls;
707 }
708 
709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710 {
711     uint64_t l, h;
712     Int128 r, t;
713 
714     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
715     muls64(&l, &h, m, n);
716     r = int128_make128(l, h);
717     if (neg) {
718         r = int128_neg(r);
719     }
720     if (a) {
721         t = int128_exts64(a);
722         t = int128_lshift(t, 63);
723         r = int128_add(r, t);
724     }
725     if (round) {
726         t = int128_exts64(1ll << 62);
727         r = int128_add(r, t);
728     }
729     r = int128_rshift(r, 63);
730 
731     return do_sat128_d(r);
732 }
733 
734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735                              void *va, uint32_t desc)
736 {
737     intptr_t i, opr_sz = simd_oprsz(desc);
738     int64_t *d = vd, *n = vn, *m = vm, *a = va;
739 
740     for (i = 0; i < opr_sz / 8; ++i) {
741         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742     }
743 }
744 
745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746                              void *va, uint32_t desc)
747 {
748     intptr_t i, opr_sz = simd_oprsz(desc);
749     int64_t *d = vd, *n = vn, *m = vm, *a = va;
750 
751     for (i = 0; i < opr_sz / 8; ++i) {
752         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753     }
754 }
755 
756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757 {
758     intptr_t i, opr_sz = simd_oprsz(desc);
759     int64_t *d = vd, *n = vn, *m = vm;
760 
761     for (i = 0; i < opr_sz / 8; ++i) {
762         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763     }
764 }
765 
766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767 {
768     intptr_t i, opr_sz = simd_oprsz(desc);
769     int64_t *d = vd, *n = vn, *m = vm;
770 
771     for (i = 0; i < opr_sz / 8; ++i) {
772         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773     }
774 }
775 
776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777 {
778     intptr_t i, j, opr_sz = simd_oprsz(desc);
779     int idx = simd_data(desc);
780     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781 
782     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783         int64_t mm = m[i];
784         for (j = 0; j < 16 / 8; ++j) {
785             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786         }
787     }
788 }
789 
790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791 {
792     intptr_t i, j, opr_sz = simd_oprsz(desc);
793     int idx = simd_data(desc);
794     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795 
796     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797         int64_t mm = m[i];
798         for (j = 0; j < 16 / 8; ++j) {
799             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800         }
801     }
802 }
803 
/*
 * Integer 8- and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes:
 * all elements are treated equally, no matter where they are.
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
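/*
 * E.g. gvec_sdot_b: each 32-bit lane d[i] accumulates into a[i] the
 * four byte products n[4i+k] * m[4i+k]; the _h forms do the same with
 * 16-bit inputs and 64-bit accumulators.
 */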

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    /*                                                                    \
     * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
     * first iteration might not be a full 16 byte segment. But           \
     * for vector lengths beyond that this must be SVE and we know        \
     * opr_sz is a multiple of 16, so we need not clamp segend            \
     * to opr_sz_n when we advance it at the end of the loop.             \
     */                                                                   \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        segend = i + (16 / sizeof(TYPED));                                \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
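/*
 * E.g. for gvec_sdot_idx_b with index 3, m_indexed selects the four
 * bytes at slot 3 of each 16-byte segment (bytes 12..15 in element
 * order), which act as the common multiplicand for every dot product
 * within that segment.
 */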

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

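    /*
     * Lanes are (real, imag) pairs.  Each output is the sum of n and
     * the swapped element of m, with exactly one of the two cross
     * terms negated according to the rotation selected by desc.
     */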
    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

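    /*
     * flip selects whether the real or imaginary half of n multiplies
     * both halves of m, and neg_real/neg_imag provide the sign flips;
     * the four combinations implement the FCMLA rotations of 0, 90,
     * 180 and 270 degrees.
     */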
    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float64)                \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)   \
    DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)
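/*
 * E.g. DO_2OP_CMP0(clt, cgt, REV) defines float16_clt0() as
 * float16_cgt(zero, op), i.e. a compare-less-than-zero built from
 * the reversed greater-than helper.
 */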

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/*
 * Reciprocal step, i.e. 2 - op1 * op2.  These are the AArch32
 * versions, which use a non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/*
 * Reciprocal square-root step, i.e. (3 - op1 * op2) / 2.
 * AArch32 non-fused semantics.
 */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/*
 * Non-fused multiply-add: the product is rounded before the addition
 * (unlike float16_muladd etc, which are fused and round only once).
 */
1464 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1465                                  float_status *stat)
1466 {
1467     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1468 }
1469 
1470 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1471                                  float_status *stat)
1472 {
1473     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1474 }
1475 
1476 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1477                                  float_status *stat)
1478 {
1479     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1480 }
1481 
1482 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1483                                  float_status *stat)
1484 {
1485     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1486 }
1487 
1488 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
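/* (Note it is op1 that is negated for VFMS, not the product.) */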
1489 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1490                                 float_status *stat)
1491 {
1492     return float16_muladd(op1, op2, dest, 0, stat);
1493 }
1494 
1495 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1496                                  float_status *stat)
1497 {
1498     return float32_muladd(op1, op2, dest, 0, stat);
1499 }
1500 
1501 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1502                                  float_status *stat)
1503 {
1504     return float64_muladd(op1, op2, dest, 0, stat);
1505 }
1506 
1507 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1508                                  float_status *stat)
1509 {
1510     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1511 }
1512 
1513 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1514                                  float_status *stat)
1515 {
1516     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1517 }
1518 
1519 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1520                                  float_status *stat)
1521 {
1522     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1523 }
1524 
1525 #define DO_MULADD(NAME, FUNC, TYPE)                                     \
1526 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1527 {                                                                          \
1528     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1529     TYPE *d = vd, *n = vn, *m = vm;                                        \
1530     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1531         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1532     }                                                                      \
1533     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1534 }
1535 
1536 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1537 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1538 
1539 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1540 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1541 
1542 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1543 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1544 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1545 
1546 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1547 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1548 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1549 
1550 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1551  * For AdvSIMD, there is of course only one such vector segment.
1552  */
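
/*
 * Worked example (ignoring the host-order macro H): with TYPE = uint32_t,
 * oprsz = 32 and idx = 1, segment = 4, so DO_MUL_IDX below computes
 * d[0..3] = n[0..3] * m[1] for the first 128-bit segment and
 * d[4..7] = n[4..7] * m[5] for the second.
 */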
1553 
1554 #define DO_MUL_IDX(NAME, TYPE, H) \
1555 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1556 {                                                                          \
1557     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1558     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1559     intptr_t idx = simd_data(desc);                                        \
1560     TYPE *d = vd, *n = vn, *m = vm;                                        \
1561     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1562         TYPE mm = m[H(i + idx)];                                           \
1563         for (j = 0; j < segment; j++) {                                    \
1564             d[i + j] = n[i + j] * mm;                                      \
1565         }                                                                  \
1566     }                                                                      \
1567     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1568 }
1569 
1570 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1571 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1572 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1573 
1574 #undef DO_MUL_IDX
1575 
1576 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1577 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1578 {                                                                          \
1579     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1580     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1581     intptr_t idx = simd_data(desc);                                        \
1582     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1583     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1584         TYPE mm = m[H(i + idx)];                                           \
1585         for (j = 0; j < segment; j++) {                                    \
1586             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1587         }                                                                  \
1588     }                                                                      \
1589     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1590 }
1591 
1592 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1593 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1594 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1595 
1596 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1597 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1598 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1599 
1600 #undef DO_MLA_IDX
1601 
1602 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1603 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1604 {                                                                          \
1605     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1606     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1607     intptr_t idx = simd_data(desc);                                        \
1608     TYPE *d = vd, *n = vn, *m = vm;                                        \
1609     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1610         TYPE mm = m[H(i + idx)];                                           \
1611         for (j = 0; j < segment; j++) {                                    \
1612             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1613         }                                                                  \
1614     }                                                                      \
1615     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1616 }
1617 
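/* nop discards the addend, reducing DO_FMUL_IDX to a plain multiply. */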
1618 #define nop(N, M, S) (M)
1619 
1620 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1621 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1622 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1623 
1624 #ifdef TARGET_AARCH64
1625 
1626 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1627 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1628 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1629 
1630 #endif
1631 
1632 #undef nop
1633 
1634 /*
1635  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1636  * the fused ops below, these accumulate both from and into Vd.
1637  */
1638 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1639 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1640 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1641 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1642 
1643 #undef DO_FMUL_IDX
1644 
1645 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1646 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1647                   void *stat, uint32_t desc)                               \
1648 {                                                                          \
1649     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1650     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1651     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1652     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1653     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
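    /* Move the negate flag to the sign bit; the XOR below negates op1. */ \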
1654     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1655     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1656         TYPE mm = m[H(i + idx)];                                           \
1657         for (j = 0; j < segment; j++) {                                    \
1658             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1659                                      mm, a[i + j], 0, stat);               \
1660         }                                                                  \
1661     }                                                                      \
1662     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1663 }
1664 
1665 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1666 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1667 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1668 
1669 #undef DO_FMLA_IDX
1670 
1671 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1672 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1673 {                                                                          \
1674     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1675     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1676     bool q = false;                                                        \
1677     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1678         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1679         if (dd < MIN) {                                                    \
1680             dd = MIN;                                                      \
1681             q = true;                                                      \
1682         } else if (dd > MAX) {                                             \
1683             dd = MAX;                                                      \
1684             q = true;                                                      \
1685         }                                                                  \
1686         d[i] = dd;                                                         \
1687     }                                                                      \
1688     if (q) {                                                               \
1689         uint32_t *qc = vq;                                                 \
1690         qc[0] = 1;                                                         \
1691     }                                                                      \
1692     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1693 }
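
/*
 * The arithmetic is performed in the wider WTYPE, so saturation of the
 * narrow result reduces to a range check against MIN/MAX; any lane that
 * saturates sets the sticky Q flag.
 */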
1694 
1695 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1696 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1697 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1698 
1699 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1700 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1701 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1702 
1703 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1704 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1705 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1706 
1707 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1708 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1709 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1710 
1711 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1712 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1713 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1714 
1715 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1716 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1717 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1718 
1719 #undef DO_SAT
1720 
1721 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1722                           void *vm, uint32_t desc)
1723 {
1724     intptr_t i, oprsz = simd_oprsz(desc);
1725     uint64_t *d = vd, *n = vn, *m = vm;
1726     bool q = false;
1727 
1728     for (i = 0; i < oprsz / 8; i++) {
1729         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1730         if (dd < nn) {
1731             dd = UINT64_MAX;
1732             q = true;
1733         }
1734         d[i] = dd;
1735     }
1736     if (q) {
1737         uint32_t *qc = vq;
1738         qc[0] = 1;
1739     }
1740     clear_tail(d, oprsz, simd_maxsz(desc));
1741 }
1742 
1743 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1744                           void *vm, uint32_t desc)
1745 {
1746     intptr_t i, oprsz = simd_oprsz(desc);
1747     uint64_t *d = vd, *n = vn, *m = vm;
1748     bool q = false;
1749 
1750     for (i = 0; i < oprsz / 8; i++) {
1751         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1752         if (nn < mm) {
1753             dd = 0;
1754             q = true;
1755         }
1756         d[i] = dd;
1757     }
1758     if (q) {
1759         uint32_t *qc = vq;
1760         qc[0] = 1;
1761     }
1762     clear_tail(d, oprsz, simd_maxsz(desc));
1763 }
1764 
1765 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1766                           void *vm, uint32_t desc)
1767 {
1768     intptr_t i, oprsz = simd_oprsz(desc);
1769     int64_t *d = vd, *n = vn, *m = vm;
1770     bool q = false;
1771 
1772     for (i = 0; i < oprsz / 8; i++) {
1773         int64_t nn = n[i], mm = m[i], dd = nn + mm;
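        /*
         * Signed overflow iff the operands have the same sign and the
         * result's sign differs; saturate toward the sign of nn.
         */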
1774         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1775             dd = (nn >> 63) ^ ~INT64_MIN;
1776             q = true;
1777         }
1778         d[i] = dd;
1779     }
1780     if (q) {
1781         uint32_t *qc = vq;
1782         qc[0] = 1;
1783     }
1784     clear_tail(d, oprsz, simd_maxsz(desc));
1785 }
1786 
1787 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1788                           void *vm, uint32_t desc)
1789 {
1790     intptr_t i, oprsz = simd_oprsz(desc);
1791     int64_t *d = vd, *n = vn, *m = vm;
1792     bool q = false;
1793 
1794     for (i = 0; i < oprsz / 8; i++) {
1795         int64_t nn = n[i], mm = m[i], dd = nn - mm;
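        /*
         * Signed overflow iff the operands differ in sign and the
         * result's sign differs from nn; saturate as for addition.
         */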
1796         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1797             dd = (nn >> 63) ^ ~INT64_MIN;
1798             q = true;
1799         }
1800         d[i] = dd;
1801     }
1802     if (q) {
1803         uint32_t *qc = vq;
1804         qc[0] = 1;
1805     }
1806     clear_tail(d, oprsz, simd_maxsz(desc));
1807 }
1808 
1809 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1810                            void *vm, uint32_t desc)
1811 {
1812     intptr_t i, oprsz = simd_oprsz(desc);
1813     uint64_t *d = vd, *n = vn, *m = vm;
1814     bool q = false;
1815 
1816     for (i = 0; i < oprsz / 8; i++) {
1817         uint64_t nn = n[i];
1818         int64_t mm = m[i];
1819         uint64_t dd = nn + mm;
1820 
1821         if (mm < 0) {
1822             if (nn < (uint64_t)-mm) {
1823                 dd = 0;
1824                 q = true;
1825             }
1826         } else {
1827             if (dd < nn) {
1828                 dd = UINT64_MAX;
1829                 q = true;
1830             }
1831         }
1832         d[i] = dd;
1833     }
1834     if (q) {
1835         uint32_t *qc = vq;
1836         qc[0] = 1;
1837     }
1838     clear_tail(d, oprsz, simd_maxsz(desc));
1839 }
1840 
1841 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1842                            void *vm, uint32_t desc)
1843 {
1844     intptr_t i, oprsz = simd_oprsz(desc);
1845     uint64_t *d = vd, *n = vn, *m = vm;
1846     bool q = false;
1847 
1848     for (i = 0; i < oprsz / 8; i++) {
1849         int64_t nn = n[i];
1850         uint64_t mm = m[i];
1851         int64_t dd = nn + mm;
1852 
1853         if (mm > (uint64_t)(INT64_MAX - nn)) {
1854             dd = INT64_MAX;
1855             q = true;
1856         }
1857         d[i] = dd;
1858     }
1859     if (q) {
1860         uint32_t *qc = vq;
1861         qc[0] = 1;
1862     }
1863     clear_tail(d, oprsz, simd_maxsz(desc));
1864 }
1865 
1866 #define DO_SRA(NAME, TYPE)                              \
1867 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1868 {                                                       \
1869     intptr_t i, oprsz = simd_oprsz(desc);               \
1870     int shift = simd_data(desc);                        \
1871     TYPE *d = vd, *n = vn;                              \
1872     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1873         d[i] += n[i] >> shift;                          \
1874     }                                                   \
1875     clear_tail(d, oprsz, simd_maxsz(desc));             \
1876 }
1877 
1878 DO_SRA(gvec_ssra_b, int8_t)
1879 DO_SRA(gvec_ssra_h, int16_t)
1880 DO_SRA(gvec_ssra_s, int32_t)
1881 DO_SRA(gvec_ssra_d, int64_t)
1882 
1883 DO_SRA(gvec_usra_b, uint8_t)
1884 DO_SRA(gvec_usra_h, uint16_t)
1885 DO_SRA(gvec_usra_s, uint32_t)
1886 DO_SRA(gvec_usra_d, uint64_t)
1887 
1888 #undef DO_SRA
1889 
1890 #define DO_RSHR(NAME, TYPE)                             \
1891 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1892 {                                                       \
1893     intptr_t i, oprsz = simd_oprsz(desc);               \
1894     int shift = simd_data(desc);                        \
1895     TYPE *d = vd, *n = vn;                              \
1896     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1897         TYPE tmp = n[i] >> (shift - 1);                 \
1898         d[i] = (tmp >> 1) + (tmp & 1);                  \
1899     }                                                   \
1900     clear_tail(d, oprsz, simd_maxsz(desc));             \
1901 }
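
/*
 * The two-step shift above computes (n[i] + (1 << (shift - 1))) >> shift,
 * i.e. a right shift with round-half-up, without the rounding addition
 * being able to overflow; DO_RSRA below uses the same trick.
 */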
1902 
1903 DO_RSHR(gvec_srshr_b, int8_t)
1904 DO_RSHR(gvec_srshr_h, int16_t)
1905 DO_RSHR(gvec_srshr_s, int32_t)
1906 DO_RSHR(gvec_srshr_d, int64_t)
1907 
1908 DO_RSHR(gvec_urshr_b, uint8_t)
1909 DO_RSHR(gvec_urshr_h, uint16_t)
1910 DO_RSHR(gvec_urshr_s, uint32_t)
1911 DO_RSHR(gvec_urshr_d, uint64_t)
1912 
1913 #undef DO_RSHR
1914 
1915 #define DO_RSRA(NAME, TYPE)                             \
1916 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1917 {                                                       \
1918     intptr_t i, oprsz = simd_oprsz(desc);               \
1919     int shift = simd_data(desc);                        \
1920     TYPE *d = vd, *n = vn;                              \
1921     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1922         TYPE tmp = n[i] >> (shift - 1);                 \
1923         d[i] += (tmp >> 1) + (tmp & 1);                 \
1924     }                                                   \
1925     clear_tail(d, oprsz, simd_maxsz(desc));             \
1926 }
1927 
1928 DO_RSRA(gvec_srsra_b, int8_t)
1929 DO_RSRA(gvec_srsra_h, int16_t)
1930 DO_RSRA(gvec_srsra_s, int32_t)
1931 DO_RSRA(gvec_srsra_d, int64_t)
1932 
1933 DO_RSRA(gvec_ursra_b, uint8_t)
1934 DO_RSRA(gvec_ursra_h, uint16_t)
1935 DO_RSRA(gvec_ursra_s, uint32_t)
1936 DO_RSRA(gvec_ursra_d, uint64_t)
1937 
1938 #undef DO_RSRA
1939 
1940 #define DO_SRI(NAME, TYPE)                              \
1941 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1942 {                                                       \
1943     intptr_t i, oprsz = simd_oprsz(desc);               \
1944     int shift = simd_data(desc);                        \
1945     TYPE *d = vd, *n = vn;                              \
1946     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1947         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1948     }                                                   \
1949     clear_tail(d, oprsz, simd_maxsz(desc));             \
1950 }
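
/*
 * E.g. for byte elements with shift = 3, the deposit64() above yields
 * d[i] = (d[i] & 0xe0) | (n[i] >> 3): the top 'shift' bits of d are
 * preserved and the remainder replaced by the shifted-in bits of n.
 */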
1951 
1952 DO_SRI(gvec_sri_b, uint8_t)
1953 DO_SRI(gvec_sri_h, uint16_t)
1954 DO_SRI(gvec_sri_s, uint32_t)
1955 DO_SRI(gvec_sri_d, uint64_t)
1956 
1957 #undef DO_SRI
1958 
1959 #define DO_SLI(NAME, TYPE)                              \
1960 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1961 {                                                       \
1962     intptr_t i, oprsz = simd_oprsz(desc);               \
1963     int shift = simd_data(desc);                        \
1964     TYPE *d = vd, *n = vn;                              \
1965     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1966         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1967     }                                                   \
1968     clear_tail(d, oprsz, simd_maxsz(desc));             \
1969 }
1970 
1971 DO_SLI(gvec_sli_b, uint8_t)
1972 DO_SLI(gvec_sli_h, uint16_t)
1973 DO_SLI(gvec_sli_s, uint32_t)
1974 DO_SLI(gvec_sli_d, uint64_t)
1975 
1976 #undef DO_SLI
1977 
1978 /*
1979  * Convert float16 to float32, raising no exceptions and
1980  * preserving exceptional values, including SNaN.
1981  * This is effectively an unpack+repack operation.
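 *
 * Worked example for the denormal path: f16 0x0001 has frac = 1, so
 * shift = clz32(1) - 21 = 10, the fraction becomes 0 once the implicit
 * bit is removed, and exp = 127 - 15 - 10 + 1 = 103, giving the float32
 * for 2**-24, the value of the smallest float16 denormal.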
1982  */
1983 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1984 {
1985     const int f16_bias = 15;
1986     const int f32_bias = 127;
1987     uint32_t sign = extract32(f16, 15, 1);
1988     uint32_t exp = extract32(f16, 10, 5);
1989     uint32_t frac = extract32(f16, 0, 10);
1990 
1991     if (exp == 0x1f) {
1992         /* Inf or NaN */
1993         exp = 0xff;
1994     } else if (exp == 0) {
1995         /* Zero or denormal.  */
1996         if (frac != 0) {
1997             if (fz16) {
1998                 frac = 0;
1999             } else {
2000                 /*
2001                  * Denormal; these are all normal float32.
2002                  * Shift the fraction so that the msb is at bit 11,
2003                  * then remove bit 11 as the implicit bit of the
2004                  * normalized float32.  Note that we still go through
2005                  * the shift for normal numbers below, to put the
2006                  * float32 fraction at the right place.
2007                  */
2008                 int shift = clz32(frac) - 21;
2009                 frac = (frac << shift) & 0x3ff;
2010                 exp = f32_bias - f16_bias - shift + 1;
2011             }
2012         }
2013     } else {
2014         /* Normal number; adjust the bias.  */
2015         exp += f32_bias - f16_bias;
2016     }
2017     sign <<= 31;
2018     exp <<= 23;
2019     frac <<= 23 - 10;
2020 
2021     return sign | exp | frac;
2022 }
2023 
2024 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2025 {
2026     /*
2027      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2028      * Load the 2nd qword iff is_q & is_2.
2029      * Shift to the 2nd dword iff !is_q & is_2.
2030      * For !is_q & !is_2, the upper bits of the result are garbage.
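     * I.e. (is_q, is_2) selects (0,0) -> u32[0], (1,0) -> u64[0],
     * (0,1) -> u32[1] via the shift by 32, and (1,1) -> u64[1].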
2031      */
2032     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2033 }
2034 
2035 /*
2036  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2037  * as there are not yet SVE versions that might use blocking.
2038  */
2039 
2040 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2041                      uint32_t desc, bool fz16)
2042 {
2043     intptr_t i, oprsz = simd_oprsz(desc);
2044     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2045     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2046     int is_q = oprsz == 16;
2047     uint64_t n_4, m_4;
2048 
2049     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2050     n_4 = load4_f16(vn, is_q, is_2);
2051     m_4 = load4_f16(vm, is_q, is_2);
2052 
2053     /* Negate all inputs for FMLSL at once.  */
2054     if (is_s) {
2055         n_4 ^= 0x8000800080008000ull;
2056     }
2057 
2058     for (i = 0; i < oprsz / 4; i++) {
2059         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2060         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2061         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2062     }
2063     clear_tail(d, oprsz, simd_maxsz(desc));
2064 }
2065 
2066 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2067                             void *venv, uint32_t desc)
2068 {
2069     CPUARMState *env = venv;
2070     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2071              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2072 }
2073 
2074 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2075                             void *venv, uint32_t desc)
2076 {
2077     CPUARMState *env = venv;
2078     do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
2079              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2080 }
2081 
2082 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2083                                void *venv, uint32_t desc)
2084 {
2085     intptr_t i, oprsz = simd_oprsz(desc);
2086     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2087     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2088     CPUARMState *env = venv;
2089     float_status *status = &env->vfp.fp_status;
2090     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2091 
2092     for (i = 0; i < oprsz; i += sizeof(float32)) {
2093         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2094         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2095         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2096         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2097         float32 aa = *(float32 *)(va + H1_4(i));
2098 
2099         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2100     }
2101 }
2102 
2103 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2104                          uint32_t desc, bool fz16)
2105 {
2106     intptr_t i, oprsz = simd_oprsz(desc);
2107     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2108     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2109     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2110     int is_q = oprsz == 16;
2111     uint64_t n_4;
2112     float32 m_1;
2113 
2114     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2115     n_4 = load4_f16(vn, is_q, is_2);
2116 
2117     /* Negate all inputs for FMLSL at once.  */
2118     if (is_s) {
2119         n_4 ^= 0x8000800080008000ull;
2120     }
2121 
2122     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2123 
2124     for (i = 0; i < oprsz / 4; i++) {
2125         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2126         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2127     }
2128     clear_tail(d, oprsz, simd_maxsz(desc));
2129 }
2130 
2131 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2132                                 void *venv, uint32_t desc)
2133 {
2134     CPUARMState *env = venv;
2135     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2136                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2137 }
2138 
2139 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2140                                 void *venv, uint32_t desc)
2141 {
2142     CPUARMState *env = venv;
2143     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
2144                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2145 }
2146 
2147 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2148                                void *venv, uint32_t desc)
2149 {
2150     intptr_t i, j, oprsz = simd_oprsz(desc);
2151     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2152     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2153     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2154     CPUARMState *env = venv;
2155     float_status *status = &env->vfp.fp_status;
2156     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2157 
2158     for (i = 0; i < oprsz; i += 16) {
2159         float16 mm_16 = *(float16 *)(vm + i + idx);
2160         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2161 
2162         for (j = 0; j < 16; j += sizeof(float32)) {
2163             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2164             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2165             float32 aa = *(float32 *)(va + H1_4(i + j));
2166 
2167             *(float32 *)(vd + H1_4(i + j)) =
2168                 float32_muladd(nn, mm, aa, 0, status);
2169         }
2170     }
2171 }
2172 
2173 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2174 {
2175     intptr_t i, opr_sz = simd_oprsz(desc);
2176     int8_t *d = vd, *n = vn, *m = vm;
2177 
2178     for (i = 0; i < opr_sz; ++i) {
2179         int8_t mm = m[i];
2180         int8_t nn = n[i];
2181         int8_t res = 0;
2182         if (mm >= 0) {
2183             if (mm < 8) {
2184                 res = nn << mm;
2185             }
2186         } else {
2187             res = nn >> (mm > -8 ? -mm : 7);
2188         }
2189         d[i] = res;
2190     }
2191     clear_tail(d, opr_sz, simd_maxsz(desc));
2192 }
2193 
2194 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2195 {
2196     intptr_t i, opr_sz = simd_oprsz(desc);
2197     int16_t *d = vd, *n = vn, *m = vm;
2198 
2199     for (i = 0; i < opr_sz / 2; ++i) {
2200         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2201         int16_t nn = n[i];
2202         int16_t res = 0;
2203         if (mm >= 0) {
2204             if (mm < 16) {
2205                 res = nn << mm;
2206             }
2207         } else {
2208             res = nn >> (mm > -16 ? -mm : 15);
2209         }
2210         d[i] = res;
2211     }
2212     clear_tail(d, opr_sz, simd_maxsz(desc));
2213 }
2214 
2215 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2216 {
2217     intptr_t i, opr_sz = simd_oprsz(desc);
2218     uint8_t *d = vd, *n = vn, *m = vm;
2219 
2220     for (i = 0; i < opr_sz; ++i) {
2221         int8_t mm = m[i];
2222         uint8_t nn = n[i];
2223         uint8_t res = 0;
2224         if (mm >= 0) {
2225             if (mm < 8) {
2226                 res = nn << mm;
2227             }
2228         } else {
2229             if (mm > -8) {
2230                 res = nn >> -mm;
2231             }
2232         }
2233         d[i] = res;
2234     }
2235     clear_tail(d, opr_sz, simd_maxsz(desc));
2236 }
2237 
2238 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2239 {
2240     intptr_t i, opr_sz = simd_oprsz(desc);
2241     uint16_t *d = vd, *n = vn, *m = vm;
2242 
2243     for (i = 0; i < opr_sz / 2; ++i) {
2244         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2245         uint16_t nn = n[i];
2246         uint16_t res = 0;
2247         if (mm >= 0) {
2248             if (mm < 16) {
2249                 res = nn << mm;
2250             }
2251         } else {
2252             if (mm > -16) {
2253                 res = nn >> -mm;
2254             }
2255         }
2256         d[i] = res;
2257     }
2258     clear_tail(d, opr_sz, simd_maxsz(desc));
2259 }
2260 
2261 /*
2262  * 8x8->8 polynomial multiply.
2263  *
2264  * Polynomial multiplication is like integer multiplication except the
2265  * partial products are XORed, not added.
2266  *
2267  * TODO: expose this as a generic vector operation, as it is a common
2268  * crypto building block.
2269  */
2270 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2271 {
2272     intptr_t i, opr_sz = simd_oprsz(desc);
2273     uint64_t *d = vd, *n = vn, *m = vm;
2274 
2275     for (i = 0; i < opr_sz / 8; ++i) {
2276         d[i] = clmul_8x8_low(n[i], m[i]);
2277     }
2278     clear_tail(d, opr_sz, simd_maxsz(desc));
2279 }
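
/*
 * For reference, a minimal scalar sketch of the carry-less multiply
 * performed per byte lane by clmul_8x8_low() above.  Illustrative
 * only; the helper uses the optimized crypto/clmul.h routine.
 */
static inline uint8_t pmul8_ref(uint8_t a, uint8_t b)
{
    uint8_t r = 0;

    for (int i = 0; i < 8; i++) {
        if (b & (1u << i)) {
            r ^= a << i;    /* XOR, rather than add, the partial products */
        }
    }
    return r;
}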
2280 
2281 /*
2282  * 64x64->128 polynomial multiply.
2283  * Because the lanes are not accessed in strict columns,
2284  * this probably cannot be turned into a generic helper.
2285  */
2286 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2287 {
2288     intptr_t i, opr_sz = simd_oprsz(desc);
2289     intptr_t hi = simd_data(desc);
2290     uint64_t *d = vd, *n = vn, *m = vm;
2291 
2292     for (i = 0; i < opr_sz / 8; i += 2) {
2293         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2294         d[i] = int128_getlo(r);
2295         d[i + 1] = int128_gethi(r);
2296     }
2297     clear_tail(d, opr_sz, simd_maxsz(desc));
2298 }
2299 
2300 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2301 {
2302     int hi = simd_data(desc);
2303     uint64_t *d = vd, *n = vn, *m = vm;
2304     uint64_t nn = n[hi], mm = m[hi];
2305 
2306     d[0] = clmul_8x4_packed(nn, mm);
2307     nn >>= 32;
2308     mm >>= 32;
2309     d[1] = clmul_8x4_packed(nn, mm);
2310 
2311     clear_tail(d, 16, simd_maxsz(desc));
2312 }
2313 
2314 #ifdef TARGET_AARCH64
2315 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2316 {
2317     int shift = simd_data(desc) * 8;
2318     intptr_t i, opr_sz = simd_oprsz(desc);
2319     uint64_t *d = vd, *n = vn, *m = vm;
2320 
2321     for (i = 0; i < opr_sz / 8; ++i) {
2322         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2323     }
2324 }
2325 
2326 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2327 {
2328     intptr_t sel = H4(simd_data(desc));
2329     intptr_t i, opr_sz = simd_oprsz(desc);
2330     uint32_t *n = vn, *m = vm;
2331     uint64_t *d = vd;
2332 
2333     for (i = 0; i < opr_sz / 8; ++i) {
2334         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2335     }
2336 }
2337 #endif
2338 
2339 #define DO_CMP0(NAME, TYPE, OP)                         \
2340 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2341 {                                                       \
2342     intptr_t i, opr_sz = simd_oprsz(desc);              \
2343     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2344         TYPE nn = *(TYPE *)(vn + i);                    \
2345         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2346     }                                                   \
2347     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2348 }
2349 
2350 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2351 DO_CMP0(gvec_clt0_b, int8_t, <)
2352 DO_CMP0(gvec_cle0_b, int8_t, <=)
2353 DO_CMP0(gvec_cgt0_b, int8_t, >)
2354 DO_CMP0(gvec_cge0_b, int8_t, >=)
2355 
2356 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2357 DO_CMP0(gvec_clt0_h, int16_t, <)
2358 DO_CMP0(gvec_cle0_h, int16_t, <=)
2359 DO_CMP0(gvec_cgt0_h, int16_t, >)
2360 DO_CMP0(gvec_cge0_h, int16_t, >=)
2361 
2362 #undef DO_CMP0
2363 
2364 #define DO_ABD(NAME, TYPE)                                      \
2365 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2366 {                                                               \
2367     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2368     TYPE *d = vd, *n = vn, *m = vm;                             \
2369                                                                 \
2370     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2371         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2372     }                                                           \
2373     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2374 }
2375 
2376 DO_ABD(gvec_sabd_b, int8_t)
2377 DO_ABD(gvec_sabd_h, int16_t)
2378 DO_ABD(gvec_sabd_s, int32_t)
2379 DO_ABD(gvec_sabd_d, int64_t)
2380 
2381 DO_ABD(gvec_uabd_b, uint8_t)
2382 DO_ABD(gvec_uabd_h, uint16_t)
2383 DO_ABD(gvec_uabd_s, uint32_t)
2384 DO_ABD(gvec_uabd_d, uint64_t)
2385 
2386 #undef DO_ABD
2387 
2388 #define DO_ABA(NAME, TYPE)                                      \
2389 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2390 {                                                               \
2391     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2392     TYPE *d = vd, *n = vn, *m = vm;                             \
2393                                                                 \
2394     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2395         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2396     }                                                           \
2397     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2398 }
2399 
2400 DO_ABA(gvec_saba_b, int8_t)
2401 DO_ABA(gvec_saba_h, int16_t)
2402 DO_ABA(gvec_saba_s, int32_t)
2403 DO_ABA(gvec_saba_d, int64_t)
2404 
2405 DO_ABA(gvec_uaba_b, uint8_t)
2406 DO_ABA(gvec_uaba_h, uint16_t)
2407 DO_ABA(gvec_uaba_s, uint32_t)
2408 DO_ABA(gvec_uaba_d, uint64_t)
2409 
2410 #undef DO_ABA
2411 
2412 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2413 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
2414 {                                                                          \
2415     ARMVectorReg scratch;                                                  \
2416     intptr_t oprsz = simd_oprsz(desc);                                     \
2417     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2418     TYPE *d = vd, *n = vn, *m = vm;                                        \
2419     if (unlikely(d == m)) {                                                \
2420         m = memcpy(&scratch, m, oprsz);                                    \
2421     }                                                                      \
2422     for (intptr_t i = 0; i < half; ++i) {                                  \
2423         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2424     }                                                                      \
2425     for (intptr_t i = 0; i < half; ++i) {                                  \
2426         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2427     }                                                                      \
2428     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2429 }
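
/*
 * Only d == m needs the scratch copy: the first loop's store to d[i]
 * clobbers at most n[i], which was already read by iteration i/2, but
 * the second loop's stores to d[half + i] can overwrite elements of m
 * that later iterations still need to read.
 */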
2430 
2431 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2432 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2433 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2434 
2435 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2436 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2437 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2438 
2439 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2440 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2441 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2442 
2443 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2444 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2445 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2446 
2447 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2448 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2449 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2450 
2451 #undef DO_3OP_PAIR
2452 
2453 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2454 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2455 {                                                               \
2456     ARMVectorReg scratch;                                       \
2457     intptr_t oprsz = simd_oprsz(desc);                          \
2458     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2459     TYPE *d = vd, *n = vn, *m = vm;                             \
2460     if (unlikely(d == m)) {                                     \
2461         m = memcpy(&scratch, m, oprsz);                         \
2462     }                                                           \
2463     for (intptr_t i = 0; i < half; ++i) {                       \
2464         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2465     }                                                           \
2466     for (intptr_t i = 0; i < half; ++i) {                       \
2467         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2468     }                                                           \
2469     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2470 }
2471 
2472 #define ADD(A, B) (A + B)
2473 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2474 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2475 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2476 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2477 #undef  ADD
2478 
2479 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2480 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2481 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2482 
2483 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2484 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2485 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2486 
2487 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2488 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2489 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2490 
2491 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2492 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2493 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2494 
2495 #undef DO_3OP_PAIR
2496 
2497 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2498     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2499     {                                                                   \
2500         intptr_t i, oprsz = simd_oprsz(desc);                           \
2501         int shift = simd_data(desc);                                    \
2502         TYPE *d = vd, *n = vn;                                          \
2503         float_status *fpst = stat;                                      \
2504         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2505             d[i] = FUNC(n[i], shift, fpst);                             \
2506         }                                                               \
2507         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2508     }
2509 
2510 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2511 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2512 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2513 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2514 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2515 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2516 
2517 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2518 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2519 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2520 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2521 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2522 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2523 
2524 #undef DO_VCVT_FIXED
2525 
2526 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2527     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2528     {                                                                   \
2529         float_status *fpst = stat;                                      \
2530         intptr_t i, oprsz = simd_oprsz(desc);                           \
2531         uint32_t rmode = simd_data(desc);                               \
2532         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2533         TYPE *d = vd, *n = vn;                                          \
2534         set_float_rounding_mode(rmode, fpst);                           \
2535         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2536             d[i] = FUNC(n[i], 0, fpst);                                 \
2537         }                                                               \
2538         set_float_rounding_mode(prev_rmode, fpst);                      \
2539         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2540     }
2541 
2542 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
2543 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
2544 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2545 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2546 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2547 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2548 
2549 #undef DO_VCVT_RMODE
2550 
2551 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2552     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2553     {                                                                   \
2554         float_status *fpst = stat;                                      \
2555         intptr_t i, oprsz = simd_oprsz(desc);                           \
2556         uint32_t rmode = simd_data(desc);                               \
2557         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2558         TYPE *d = vd, *n = vn;                                          \
2559         set_float_rounding_mode(rmode, fpst);                           \
2560         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2561             d[i] = FUNC(n[i], fpst);                                    \
2562         }                                                               \
2563         set_float_rounding_mode(prev_rmode, fpst);                      \
2564         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2565     }
2566 
2567 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2568 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2569 
2570 #undef DO_VRINT_RMODE
2571 
2572 #ifdef TARGET_AARCH64
2573 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2574 {
2575     const uint8_t *indices = vm;
2576     CPUARMState *env = venv;
2577     size_t oprsz = simd_oprsz(desc);
2578     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2579     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2580     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2581     union {
2582         uint8_t b[16];
2583         uint64_t d[2];
2584     } result;
2585 
2586     /*
2587      * We must construct the final result in a temp, lest the output
2588      * overlap the input table.  For TBL, begin with zero; for TBX,
2589      * begin with the original register contents.  Note that we always
2590      * copy 16 bytes here to avoid an extra branch; clearing the high
2591      * bits of the register for oprsz == 8 is handled below.
2592      */
2593     if (is_tbx) {
2594         memcpy(&result, vd, 16);
2595     } else {
2596         memset(&result, 0, 16);
2597     }
2598 
2599     for (size_t i = 0; i < oprsz; ++i) {
2600         uint32_t index = indices[H1(i)];
2601 
2602         if (index < table_len) {
2603             /*
2604              * Convert index (a byte offset into the virtual table
2605              * which is a series of 128-bit vectors concatenated)
2606              * into the correct register element, bearing in mind
2607              * that the table can wrap around from V31 to V0.
2608              */
2609             const uint8_t *table = (const uint8_t *)
2610                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2611             result.b[H1(i)] = table[H1(index % 16)];
2612         }
2613     }
2614 
2615     memcpy(vd, &result, 16);
2616     clear_tail(vd, oprsz, simd_maxsz(desc));
2617 }
2618 #endif
2619 
2620 /*
2621  * NxN -> N highpart multiply
2622  *
2623  * TODO: expose this as a generic vector operation.
2624  */
2625 
2626 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2627 {
2628     intptr_t i, opr_sz = simd_oprsz(desc);
2629     int8_t *d = vd, *n = vn, *m = vm;
2630 
2631     for (i = 0; i < opr_sz; ++i) {
2632         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2633     }
2634     clear_tail(d, opr_sz, simd_maxsz(desc));
2635 }
2636 
2637 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2638 {
2639     intptr_t i, opr_sz = simd_oprsz(desc);
2640     int16_t *d = vd, *n = vn, *m = vm;
2641 
2642     for (i = 0; i < opr_sz / 2; ++i) {
2643         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2644     }
2645     clear_tail(d, opr_sz, simd_maxsz(desc));
2646 }
2647 
2648 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2649 {
2650     intptr_t i, opr_sz = simd_oprsz(desc);
2651     int32_t *d = vd, *n = vn, *m = vm;
2652 
2653     for (i = 0; i < opr_sz / 4; ++i) {
2654         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2655     }
2656     clear_tail(d, opr_sz, simd_maxsz(desc));
2657 }
2658 
2659 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2660 {
2661     intptr_t i, opr_sz = simd_oprsz(desc);
2662     uint64_t *d = vd, *n = vn, *m = vm;
2663     uint64_t discard;
2664 
2665     for (i = 0; i < opr_sz / 8; ++i) {
2666         muls64(&discard, &d[i], n[i], m[i]);
2667     }
2668     clear_tail(d, opr_sz, simd_maxsz(desc));
2669 }
2670 
2671 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2672 {
2673     intptr_t i, opr_sz = simd_oprsz(desc);
2674     uint8_t *d = vd, *n = vn, *m = vm;
2675 
2676     for (i = 0; i < opr_sz; ++i) {
2677         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2678     }
2679     clear_tail(d, opr_sz, simd_maxsz(desc));
2680 }
2681 
2682 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2683 {
2684     intptr_t i, opr_sz = simd_oprsz(desc);
2685     uint16_t *d = vd, *n = vn, *m = vm;
2686 
2687     for (i = 0; i < opr_sz / 2; ++i) {
2688         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2689     }
2690     clear_tail(d, opr_sz, simd_maxsz(desc));
2691 }
2692 
2693 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2694 {
2695     intptr_t i, opr_sz = simd_oprsz(desc);
2696     uint32_t *d = vd, *n = vn, *m = vm;
2697 
2698     for (i = 0; i < opr_sz / 4; ++i) {
2699         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2700     }
2701     clear_tail(d, opr_sz, simd_maxsz(desc));
2702 }
2703 
2704 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2705 {
2706     intptr_t i, opr_sz = simd_oprsz(desc);
2707     uint64_t *d = vd, *n = vn, *m = vm;
2708     uint64_t discard;
2709 
2710     for (i = 0; i < opr_sz / 8; ++i) {
2711         mulu64(&discard, &d[i], n[i], m[i]);
2712     }
2713     clear_tail(d, opr_sz, simd_maxsz(desc));
2714 }
2715 
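/* XAR: exclusive-OR the operands, then rotate the result right by shr. */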
2716 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2717 {
2718     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2719     int shr = simd_data(desc);
2720     uint64_t *d = vd, *n = vn, *m = vm;
2721 
2722     for (i = 0; i < opr_sz; ++i) {
2723         d[i] = ror64(n[i] ^ m[i], shr);
2724     }
2725     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2726 }
2727 
2728 /*
2729  * Integer matrix-multiply accumulate
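 *
 * Each 16-byte segment of Vn/Vm holds a 2x8 matrix of (u)int8 (two rows
 * of eight bytes), and the matching segment of Vd/Va holds a 2x2 matrix
 * of int32: d[i][j] = a[i][j] + dot(Vn row i, Vm row j).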
2730  */
2731 
2732 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2733 {
2734     int8_t *n = vn, *m = vm;
2735 
2736     for (intptr_t k = 0; k < 8; ++k) {
2737         sum += n[H1(k)] * m[H1(k)];
2738     }
2739     return sum;
2740 }
2741 
2742 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2743 {
2744     uint8_t *n = vn, *m = vm;
2745 
2746     for (intptr_t k = 0; k < 8; ++k) {
2747         sum += n[H1(k)] * m[H1(k)];
2748     }
2749     return sum;
2750 }
2751 
2752 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2753 {
2754     uint8_t *n = vn;
2755     int8_t *m = vm;
2756 
2757     for (intptr_t k = 0; k < 8; ++k) {
2758         sum += n[H1(k)] * m[H1(k)];
2759     }
2760     return sum;
2761 }
2762 
2763 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2764                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2765 {
2766     intptr_t seg, opr_sz = simd_oprsz(desc);
2767 
2768     for (seg = 0; seg < opr_sz; seg += 16) {
2769         uint32_t *d = vd + seg;
2770         uint32_t *a = va + seg;
2771         uint32_t sum0, sum1, sum2, sum3;
2772 
2773         /*
2774          * Process the entire segment at once, writing back the
2775          * results only after we've consumed all of the inputs.
2776          *
2777          * Key to indices by column:
2778          *          i   j                  i             j
2779          */
2780         sum0 = a[H4(0 + 0)];
2781         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2782         sum1 = a[H4(0 + 1)];
2783         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2784         sum2 = a[H4(2 + 0)];
2785         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2786         sum3 = a[H4(2 + 1)];
2787         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2788 
2789         d[H4(0)] = sum0;
2790         d[H4(1)] = sum1;
2791         d[H4(2)] = sum2;
2792         d[H4(3)] = sum3;
2793     }
2794     clear_tail(vd, opr_sz, simd_maxsz(desc));
2795 }
2796 
2797 #define DO_MMLA_B(NAME, INNER) \
2798     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2799     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2800 
2801 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2802 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2803 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2804 
2805 /*
2806  * BFloat16 Dot Product
2807  */
2808 
2809 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2810 {
2811     /*
2812      * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2813      * For EBF = 0, we ignore the FPCR bits which determine rounding
2814      * mode and denormal-flushing, and we do unfused multiplies and
2815      * additions with intermediate rounding of all products and sums.
2816      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2817      * and we perform a fused two-way sum-of-products without intermediate
2818      * rounding of the products.
2819      * In either case, we don't set fp exception flags.
2820      *
2821      * EBF is AArch64 only, so even if it's set in the FPCR it has
2822      * no effect on AArch32 instructions.
2823      */
2824     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2825 
2826     *statusp = env->vfp.fp_status;
2827     set_default_nan_mode(true, statusp);
2828 
2829     if (ebf) {
2830         /* EBF=1 needs to do a step with round-to-odd semantics */
2831         *oddstatusp = *statusp;
2832         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2833     } else {
2834         set_flush_to_zero(true, statusp);
2835         set_flush_inputs_to_zero(true, statusp);
2836         set_float_rounding_mode(float_round_to_odd_inf, statusp);
2837     }
2838     return ebf;
2839 }
2840 
2841 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2842 {
2843     float32 t1, t2;
2844 
2845     /*
2846      * Extract each BFloat16 from the element pair, and shift
2847      * them such that they become float32.
2848      */
2849     t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2850     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2851     t1 = float32_add(t1, t2, fpst);
2852     t1 = float32_add(sum, t1, fpst);
2853 
2854     return t1;
2855 }

float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
                     float_status *fpst, float_status *fpst_odd)
{
    /*
     * Compare f16_dotadd() in sme_helper.c, but here we have
     * bfloat16 inputs. In particular that means that we do not
     * want the FPCR.FZ16 flush semantics, so we use the normal
     * float_status for the input handling here.
     */
    float64 e1r = float32_to_float64(e1 << 16, fpst);
    float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
    float64 e2r = float32_to_float64(e2 << 16, fpst);
    float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
    float64 t64;
    float32 t32;

    /*
     * The ARM pseudocode function FPDot performs both multiplies
     * and the add with a single rounding operation.  Emulate this
     * by performing the first multiply in round-to-odd, then doing
     * the second multiply as fused multiply-add, and rounding to
     * float32 all in one step.
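     *
     * Round-to-odd forces the low bit of an inexact product to 1,
     * preserving the "sticky" information; since float64 carries
     * more than twice the precision of float32, the final rounding
     * then matches a single rounding of the exact result.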
     */
    t64 = float64_mul(e1r, e2r, fpst_odd);
    t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);

    /* This conversion is exact, because we've already rounded. */
    t32 = float64_to_float32(t64, fpst);

    /* The final accumulation step is not fused. */
    return float32_add(sum, t32, fpst);
}

void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
                        CPUARMState *env, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (i = 0; i < opr_sz / 4; ++i) {
            d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
        }
    } else {
        for (i = 0; i < opr_sz / 4; ++i) {
            d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
                            void *va, CPUARMState *env, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t index = simd_data(desc);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

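    /*
     * The indexed 32-bit element pair of M is read once per 128-bit
     * segment and applied to every element within that segment.
     */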
    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (i = 0; i < elements; i += eltspersegment) {
            uint32_t m_idx = m[i + H4(index)];

            for (j = i; j < i + eltspersegment; j++) {
                d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
            }
        }
    } else {
        for (i = 0; i < elements; i += eltspersegment) {
            uint32_t m_idx = m[i + H4(index)];

            for (j = i; j < i + eltspersegment; j++) {
                d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
            }
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
                         CPUARMState *env, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (s = 0; s < opr_sz / 4; s += 4) {
            float32 sum00, sum01, sum10, sum11;

            /*
             * Process the entire segment at once, writing back the
             * results only after we've consumed all of the inputs.
             *
             * Key to indices by column: destination element 2*i + j,
             * N element 2*i + k, M element 2*j + k, for k = 0 and 1:
             *               i   j               i   k             j   k
             */
            sum00 = a[s + H4(0 + 0)];
            sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
            sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);

            sum01 = a[s + H4(0 + 1)];
            sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
            sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);

            sum10 = a[s + H4(2 + 0)];
            sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
            sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);

            sum11 = a[s + H4(2 + 1)];
            sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
            sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);

            d[s + H4(0 + 0)] = sum00;
            d[s + H4(0 + 1)] = sum01;
            d[s + H4(2 + 0)] = sum10;
            d[s + H4(2 + 1)] = sum11;
        }
    } else {
        for (s = 0; s < opr_sz / 4; s += 4) {
            float32 sum00, sum01, sum10, sum11;

            /*
             * Process the entire segment at once, writing back the
             * results only after we've consumed all of the inputs.
             *
             * Key to indices by column: destination element 2*i + j,
             * N element 2*i + k, M element 2*j + k, for k = 0 and 1:
             *               i   j           i   k             j   k
             */
            sum00 = a[s + H4(0 + 0)];
            sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
            sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);

            sum01 = a[s + H4(0 + 1)];
            sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
            sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);

            sum10 = a[s + H4(2 + 0)];
            sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
            sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);

            sum11 = a[s + H4(2 + 1)];
            sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
            sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);

            d[s + H4(0 + 0)] = sum00;
            d[s + H4(0 + 1)] = sum01;
            d[s + H4(2 + 0)] = sum10;
            d[s + H4(2 + 1)] = sum11;
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
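
/*
 * Per 128-bit segment, BFMMLA above computes
 *     D[i][j] = A[i][j] + sum(k = 0..3) N[i][k] * M[j][k]
 * where N and M are 2x4 matrices of bfloat16 and A, D are 2x2
 * matrices of float32; each bfdotadd() call consumes one uint32_t
 * holding a packed pair of bfloat16 elements (k and k+1).
 */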

void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
                         void *stat, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    intptr_t sel = simd_data(desc);
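    /* sel picks the even- or odd-numbered bfloat16 of each pair
       (BFMLALB vs BFMLALT). */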
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        float32 nn = n[H2(i * 2 + sel)] << 16;
        float32 mm = m[H2(i * 2 + sel)] << 16;
        d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
                             void *va, void *stat, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
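    /* The data field packs sel in bit 0, the element index in bits [3:1]. */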
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 m_idx = m[H2(2 * i + index)] << 16;

        for (j = i; j < i + eltspersegment; j++) {
            float32 n_j = n[H2(2 * j + sel)] << 16;
            d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
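/*
 * Elementwise clamp: d = MIN(MAX(a, n), m), i.e. A bounded below by
 * N and above by M (SCLAMP/UCLAMP).
 */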
#define DO_CLAMP(NAME, TYPE) \
void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
        TYPE aa = *(TYPE *)(a + i);                                     \
        TYPE nn = *(TYPE *)(n + i);                                     \
        TYPE mm = *(TYPE *)(m + i);                                     \
        TYPE dd = MIN(MAX(aa, nn), mm);                                 \
        *(TYPE *)(d + i) = dd;                                          \
    }                                                                   \
    clear_tail(d, opr_sz, simd_maxsz(desc));                            \
}

DO_CLAMP(gvec_sclamp_b, int8_t)
DO_CLAMP(gvec_sclamp_h, int16_t)
DO_CLAMP(gvec_sclamp_s, int32_t)
DO_CLAMP(gvec_sclamp_d, int64_t)

DO_CLAMP(gvec_uclamp_b, uint8_t)
DO_CLAMP(gvec_uclamp_h, uint16_t)
DO_CLAMP(gvec_uclamp_s, uint32_t)
DO_CLAMP(gvec_uclamp_d, uint64_t)

/* Bit count in each 8-bit word. */
void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ctpop8(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Reverse bits in each 8-bit word. */
void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn;

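    /*
     * revbit64() reverses all 64 bits, which reverses both the byte
     * order and the bit order within each byte; bswap64() first
     * pre-reverses the byte order, so the net effect reverses the
     * bits within each byte while leaving the bytes in place.
     */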
    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = revbit64(bswap64(n[i]));
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
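/* Per-element estimates: URECPE (reciprocal) and URSQRTE (reciprocal sqrt). */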
void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_recpe_u32(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_rsqrte_u32(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}