xref: /qemu/target/arm/tcg/vec_helper.c (revision 35aae9d24c060f5de2cfb3511359818a41e383b1)
1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28 
29 /*
30  * Data for expanding active predicate bits to bytes, for byte elements.
31  *
32  *  for (i = 0; i < 256; ++i) {
33  *      unsigned long m = 0;
34  *      for (j = 0; j < 8; j++) {
35  *          if ((i >> j) & 1) {
36  *              m |= 0xfful << (j << 3);
37  *          }
38  *      }
39  *      printf("0x%016lx,\n", m);
40  *  }
41  */
42 const uint64_t expand_pred_b_data[256] = {
43     0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44     0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45     0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46     0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47     0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48     0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49     0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50     0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51     0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52     0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53     0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54     0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55     0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56     0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57     0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58     0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59     0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60     0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61     0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62     0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63     0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64     0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65     0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66     0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67     0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68     0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69     0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70     0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71     0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72     0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73     0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74     0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75     0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76     0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77     0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78     0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79     0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80     0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81     0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82     0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83     0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84     0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85     0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86     0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87     0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88     0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89     0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90     0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91     0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92     0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93     0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94     0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95     0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96     0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97     0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98     0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99     0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100     0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101     0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102     0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103     0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104     0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105     0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106     0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107     0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108     0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109     0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110     0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111     0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112     0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113     0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114     0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115     0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116     0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117     0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118     0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119     0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120     0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121     0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122     0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123     0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124     0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125     0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126     0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127     0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128     0xffffffffffffffff,
129 };
130 
131 /*
132  * Similarly for half-word elements.
133  *  for (i = 0; i < 256; ++i) {
134  *      unsigned long m = 0;
135  *      if (i & 0xaa) {
136  *          continue;
137  *      }
138  *      for (j = 0; j < 8; j += 2) {
139  *          if ((i >> j) & 1) {
140  *              m |= 0xfffful << (j << 3);
141  *          }
142  *      }
143  *      printf("[0x%x] = 0x%016lx,\n", i, m);
144  *  }
145  */
146 const uint64_t expand_pred_h_data[0x55 + 1] = {
147     [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148     [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149     [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150     [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151     [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152     [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153     [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154     [0x55] = 0xffffffffffffffff,
155 };
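
/*
 * Roughly, a caller expands a predicate byte by indexing these tables,
 * yielding a 64-bit mask with all-ones in each active element, e.g.
 *
 *     expand_pred_b_data[0x05] == 0x0000000000ff00ff   (bits 0,2 -> bytes 0,2)
 *     expand_pred_h_data[0x11] == 0x0000ffff0000ffff   (bits 0,4 -> hwords 0,2)
 *
 * For halfwords only every other predicate bit is significant, which
 * is why expand_pred_h_data needs only 0x55 + 1 entries.
 */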
156 
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159                      bool neg, bool round)
160 {
161     /*
162      * Simplify (e1/e2 are src1/src2, a3 is the accumulator src3):
163      * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164      * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
165      */
166     int32_t ret = (int32_t)src1 * src2;
167     if (neg) {
168         ret = -ret;
169     }
170     ret += ((int32_t)src3 << 7) + (round << 6);
171     ret >>= 7;
172 
173     if (ret != (int8_t)ret) {
174         ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175     }
176     return ret;
177 }
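
/*
 * A minimal worked example of the arithmetic above: with
 * src1 = src2 = 0x40 (64), src3 = 0 and round clear, the product is
 * 4096 and 4096 >> 7 == 32, i.e. the high byte of the doubled product.
 * The only multiply that can overflow the high byte is
 * INT8_MIN * INT8_MIN: 16384 >> 7 == 128 does not fit in int8_t, so
 * the result saturates to INT8_MAX.
 */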
178 
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180                              void *va, uint32_t desc)
181 {
182     intptr_t i, opr_sz = simd_oprsz(desc);
183     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 
185     for (i = 0; i < opr_sz; ++i) {
186         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187     }
188 }
189 
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191                              void *va, uint32_t desc)
192 {
193     intptr_t i, opr_sz = simd_oprsz(desc);
194     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 
196     for (i = 0; i < opr_sz; ++i) {
197         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198     }
199 }
200 
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203     intptr_t i, opr_sz = simd_oprsz(desc);
204     int8_t *d = vd, *n = vn, *m = vm;
205 
206     for (i = 0; i < opr_sz; ++i) {
207         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208     }
209 }
210 
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213     intptr_t i, opr_sz = simd_oprsz(desc);
214     int8_t *d = vd, *n = vn, *m = vm;
215 
216     for (i = 0; i < opr_sz; ++i) {
217         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218     }
219 }
220 
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223                       bool neg, bool round, uint32_t *sat)
224 {
225     /* Simplify similarly to do_sqrdmlah_b above.  */
226     int32_t ret = (int32_t)src1 * src2;
227     if (neg) {
228         ret = -ret;
229     }
230     ret += ((int32_t)src3 << 15) + (round << 14);
231     ret >>= 15;
232 
233     if (ret != (int16_t)ret) {
234         *sat = 1;
235         ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236     }
237     return ret;
238 }
239 
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241                                   uint32_t src2, uint32_t src3)
242 {
243     uint32_t *sat = &env->vfp.qc[0];
244     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246                                 false, true, sat);
247     return deposit32(e1, 16, 16, e2);
248 }
249 
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251                               void *vq, uint32_t desc)
252 {
253     uintptr_t opr_sz = simd_oprsz(desc);
254     int16_t *d = vd;
255     int16_t *n = vn;
256     int16_t *m = vm;
257     uintptr_t i;
258 
259     for (i = 0; i < opr_sz / 2; ++i) {
260         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261     }
262     clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264 
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266                                   uint32_t src2, uint32_t src3)
267 {
268     uint32_t *sat = &env->vfp.qc[0];
269     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271                                 true, true, sat);
272     return deposit32(e1, 16, 16, e2);
273 }
274 
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276                               void *vq, uint32_t desc)
277 {
278     uintptr_t opr_sz = simd_oprsz(desc);
279     int16_t *d = vd;
280     int16_t *n = vn;
281     int16_t *m = vm;
282     uintptr_t i;
283 
284     for (i = 0; i < opr_sz / 2; ++i) {
285         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286     }
287     clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289 
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291                             void *vq, uint32_t desc)
292 {
293     intptr_t i, opr_sz = simd_oprsz(desc);
294     int16_t *d = vd, *n = vn, *m = vm;
295 
296     for (i = 0; i < opr_sz / 2; ++i) {
297         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298     }
299     clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301 
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303                              void *vq, uint32_t desc)
304 {
305     intptr_t i, opr_sz = simd_oprsz(desc);
306     int16_t *d = vd, *n = vn, *m = vm;
307 
308     for (i = 0; i < opr_sz / 2; ++i) {
309         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310     }
311     clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313 
314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315                                 void *vq, uint32_t desc)
316 {
317     intptr_t i, j, opr_sz = simd_oprsz(desc);
318     int idx = simd_data(desc);
319     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320     intptr_t elements = opr_sz / 2;
321     intptr_t eltspersegment = MIN(16 / 2, elements);
322 
323     for (i = 0; i < elements; i += 16 / 2) {
324         int16_t mm = m[i];
325         for (j = 0; j < eltspersegment; ++j) {
326             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
327         }
328     }
329     clear_tail(d, opr_sz, simd_maxsz(desc));
330 }
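
/*
 * Note on the segmenting above: the indexed element is applied once
 * per 128-bit segment.  For an 8-byte AdvSIMD operation elements == 4,
 * so eltspersegment is clamped to 4 and the outer loop runs once; for
 * a 16-byte operation the single segment uses m[0], the element
 * selected by H2(idx).  The same pattern recurs in the helpers below.
 */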
331 
332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
333                                  void *vq, uint32_t desc)
334 {
335     intptr_t i, j, opr_sz = simd_oprsz(desc);
336     int idx = simd_data(desc);
337     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338     intptr_t elements = opr_sz / 2;
339     intptr_t eltspersegment = MIN(16 / 2, elements);
340 
341     for (i = 0; i < elements; i += 16 / 2) {
342         int16_t mm = m[i];
343         for (j = 0; j < eltspersegment; ++j) {
344             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
345         }
346     }
347     clear_tail(d, opr_sz, simd_maxsz(desc));
348 }
349 
350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
351                                  void *vq, uint32_t desc)
352 {
353     intptr_t i, j, opr_sz = simd_oprsz(desc);
354     int idx = simd_data(desc);
355     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
356     intptr_t elements = opr_sz / 2;
357     intptr_t eltspersegment = MIN(16 / 2, elements);
358 
359     for (i = 0; i < elements; i += 16 / 2) {
360         int16_t mm = m[i];
361         for (j = 0; j < eltspersegment; ++j) {
362             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
363         }
364     }
365     clear_tail(d, opr_sz, simd_maxsz(desc));
366 }
367 
368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
369                                  void *vq, uint32_t desc)
370 {
371     intptr_t i, j, opr_sz = simd_oprsz(desc);
372     int idx = simd_data(desc);
373     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
374     intptr_t elements = opr_sz / 2;
375     intptr_t eltspersegment = MIN(16 / 2, elements);
376 
377     for (i = 0; i < elements; i += 16 / 2) {
378         int16_t mm = m[i];
379         for (j = 0; j < eltspersegment; ++j) {
380             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
381         }
382     }
383     clear_tail(d, opr_sz, simd_maxsz(desc));
384 }
385 
386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
387                              void *va, uint32_t desc)
388 {
389     intptr_t i, opr_sz = simd_oprsz(desc);
390     int16_t *d = vd, *n = vn, *m = vm, *a = va;
391     uint32_t discard;
392 
393     for (i = 0; i < opr_sz / 2; ++i) {
394         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
395     }
396 }
397 
398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
399                              void *va, uint32_t desc)
400 {
401     intptr_t i, opr_sz = simd_oprsz(desc);
402     int16_t *d = vd, *n = vn, *m = vm, *a = va;
403     uint32_t discard;
404 
405     for (i = 0; i < opr_sz / 2; ++i) {
406         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
407     }
408 }
409 
410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
411 {
412     intptr_t i, opr_sz = simd_oprsz(desc);
413     int16_t *d = vd, *n = vn, *m = vm;
414     uint32_t discard;
415 
416     for (i = 0; i < opr_sz / 2; ++i) {
417         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
418     }
419 }
420 
421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
422 {
423     intptr_t i, opr_sz = simd_oprsz(desc);
424     int16_t *d = vd, *n = vn, *m = vm;
425     uint32_t discard;
426 
427     for (i = 0; i < opr_sz / 2; ++i) {
428         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
429     }
430 }
431 
432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
433 {
434     intptr_t i, j, opr_sz = simd_oprsz(desc);
435     int idx = simd_data(desc);
436     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
437     uint32_t discard;
438 
439     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
440         int16_t mm = m[i];
441         for (j = 0; j < 16 / 2; ++j) {
442             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
443         }
444     }
445 }
446 
447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
448 {
449     intptr_t i, j, opr_sz = simd_oprsz(desc);
450     int idx = simd_data(desc);
451     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
452     uint32_t discard;
453 
454     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
455         int16_t mm = m[i];
456         for (j = 0; j < 16 / 2; ++j) {
457             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
458         }
459     }
460 }
461 
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
464                       bool neg, bool round, uint32_t *sat)
465 {
466     /* Simplify similarly to do_sqrdmlah_b above.  */
467     int64_t ret = (int64_t)src1 * src2;
468     if (neg) {
469         ret = -ret;
470     }
471     ret += ((int64_t)src3 << 31) + (round << 30);
472     ret >>= 31;
473 
474     if (ret != (int32_t)ret) {
475         *sat = 1;
476         ret = (ret < 0 ? INT32_MIN : INT32_MAX);
477     }
478     return ret;
479 }
480 
481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
482                                   int32_t src2, int32_t src3)
483 {
484     uint32_t *sat = &env->vfp.qc[0];
485     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
486 }
487 
488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
489                               void *vq, uint32_t desc)
490 {
491     uintptr_t opr_sz = simd_oprsz(desc);
492     int32_t *d = vd;
493     int32_t *n = vn;
494     int32_t *m = vm;
495     uintptr_t i;
496 
497     for (i = 0; i < opr_sz / 4; ++i) {
498         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
499     }
500     clear_tail(d, opr_sz, simd_maxsz(desc));
501 }
502 
503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
504                                   int32_t src2, int32_t src3)
505 {
506     uint32_t *sat = &env->vfp.qc[0];
507     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
508 }
509 
510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
511                               void *vq, uint32_t desc)
512 {
513     uintptr_t opr_sz = simd_oprsz(desc);
514     int32_t *d = vd;
515     int32_t *n = vn;
516     int32_t *m = vm;
517     uintptr_t i;
518 
519     for (i = 0; i < opr_sz / 4; ++i) {
520         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
521     }
522     clear_tail(d, opr_sz, simd_maxsz(desc));
523 }
524 
525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
526                             void *vq, uint32_t desc)
527 {
528     intptr_t i, opr_sz = simd_oprsz(desc);
529     int32_t *d = vd, *n = vn, *m = vm;
530 
531     for (i = 0; i < opr_sz / 4; ++i) {
532         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
533     }
534     clear_tail(d, opr_sz, simd_maxsz(desc));
535 }
536 
537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
538                              void *vq, uint32_t desc)
539 {
540     intptr_t i, opr_sz = simd_oprsz(desc);
541     int32_t *d = vd, *n = vn, *m = vm;
542 
543     for (i = 0; i < opr_sz / 4; ++i) {
544         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
545     }
546     clear_tail(d, opr_sz, simd_maxsz(desc));
547 }
548 
549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
550                                 void *vq, uint32_t desc)
551 {
552     intptr_t i, j, opr_sz = simd_oprsz(desc);
553     int idx = simd_data(desc);
554     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
555     intptr_t elements = opr_sz / 4;
556     intptr_t eltspersegment = MIN(16 / 4, elements);
557 
558     for (i = 0; i < elements; i += 16 / 4) {
559         int32_t mm = m[i];
560         for (j = 0; j < eltspersegment; ++j) {
561             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
562         }
563     }
564     clear_tail(d, opr_sz, simd_maxsz(desc));
565 }
566 
567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
568                                  void *vq, uint32_t desc)
569 {
570     intptr_t i, j, opr_sz = simd_oprsz(desc);
571     int idx = simd_data(desc);
572     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
573     intptr_t elements = opr_sz / 4;
574     intptr_t eltspersegment = MIN(16 / 4, elements);
575 
576     for (i = 0; i < elements; i += 16 / 4) {
577         int32_t mm = m[i];
578         for (j = 0; j < eltspersegment; ++j) {
579             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
580         }
581     }
582     clear_tail(d, opr_sz, simd_maxsz(desc));
583 }
584 
585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
586                                  void *vq, uint32_t desc)
587 {
588     intptr_t i, j, opr_sz = simd_oprsz(desc);
589     int idx = simd_data(desc);
590     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
591     intptr_t elements = opr_sz / 4;
592     intptr_t eltspersegment = MIN(16 / 4, elements);
593 
594     for (i = 0; i < elements; i += 16 / 4) {
595         int32_t mm = m[i];
596         for (j = 0; j < eltspersegment; ++j) {
597             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
598         }
599     }
600     clear_tail(d, opr_sz, simd_maxsz(desc));
601 }
602 
603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
604                                  void *vq, uint32_t desc)
605 {
606     intptr_t i, j, opr_sz = simd_oprsz(desc);
607     int idx = simd_data(desc);
608     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
609     intptr_t elements = opr_sz / 4;
610     intptr_t eltspersegment = MIN(16 / 4, elements);
611 
612     for (i = 0; i < elements; i += 16 / 4) {
613         int32_t mm = m[i];
614         for (j = 0; j < eltspersegment; ++j) {
615             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
616         }
617     }
618     clear_tail(d, opr_sz, simd_maxsz(desc));
619 }
620 
621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
622                              void *va, uint32_t desc)
623 {
624     intptr_t i, opr_sz = simd_oprsz(desc);
625     int32_t *d = vd, *n = vn, *m = vm, *a = va;
626     uint32_t discard;
627 
628     for (i = 0; i < opr_sz / 4; ++i) {
629         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
630     }
631 }
632 
633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
634                              void *va, uint32_t desc)
635 {
636     intptr_t i, opr_sz = simd_oprsz(desc);
637     int32_t *d = vd, *n = vn, *m = vm, *a = va;
638     uint32_t discard;
639 
640     for (i = 0; i < opr_sz / 4; ++i) {
641         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
642     }
643 }
644 
645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
646 {
647     intptr_t i, opr_sz = simd_oprsz(desc);
648     int32_t *d = vd, *n = vn, *m = vm;
649     uint32_t discard;
650 
651     for (i = 0; i < opr_sz / 4; ++i) {
652         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
653     }
654 }
655 
656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
657 {
658     intptr_t i, opr_sz = simd_oprsz(desc);
659     int32_t *d = vd, *n = vn, *m = vm;
660     uint32_t discard;
661 
662     for (i = 0; i < opr_sz / 4; ++i) {
663         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
664     }
665 }
666 
667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
668 {
669     intptr_t i, j, opr_sz = simd_oprsz(desc);
670     int idx = simd_data(desc);
671     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
672     uint32_t discard;
673 
674     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
675         int32_t mm = m[i];
676         for (j = 0; j < 16 / 4; ++j) {
677             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
678         }
679     }
680 }
681 
682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
683 {
684     intptr_t i, j, opr_sz = simd_oprsz(desc);
685     int idx = simd_data(desc);
686     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
687     uint32_t discard;
688 
689     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
690         int32_t mm = m[i];
691         for (j = 0; j < 16 / 4; ++j) {
692             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
693         }
694     }
695 }
696 
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
698 static int64_t do_sat128_d(Int128 r)
699 {
700     int64_t ls = int128_getlo(r);
701     int64_t hs = int128_gethi(r);
702 
703     if (unlikely(hs != (ls >> 63))) {
704         return hs < 0 ? INT64_MIN : INT64_MAX;
705     }
706     return ls;
707 }
708 
709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710 {
711     uint64_t l, h;
712     Int128 r, t;
713 
714     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
715     muls64(&l, &h, m, n);
716     r = int128_make128(l, h);
717     if (neg) {
718         r = int128_neg(r);
719     }
720     if (a) {
721         t = int128_exts64(a);
722         t = int128_lshift(t, 63);
723         r = int128_add(r, t);
724     }
725     if (round) {
726         t = int128_exts64(1ll << 62);
727         r = int128_add(r, t);
728     }
729     r = int128_rshift(r, 63);
730 
731     return do_sat128_d(r);
732 }
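
/*
 * A worked example of the saturation path: for n = m = INT64_MIN with
 * a == 0 and round set, the 128-bit product is 2**126; adding 2**62
 * and shifting right by 63 leaves 2**63, whose halves disagree in
 * do_sat128_d (hi == 0 but lo >> 63 == -1), so the result saturates
 * to INT64_MAX.
 */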
733 
734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735                              void *va, uint32_t desc)
736 {
737     intptr_t i, opr_sz = simd_oprsz(desc);
738     int64_t *d = vd, *n = vn, *m = vm, *a = va;
739 
740     for (i = 0; i < opr_sz / 8; ++i) {
741         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742     }
743 }
744 
745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746                              void *va, uint32_t desc)
747 {
748     intptr_t i, opr_sz = simd_oprsz(desc);
749     int64_t *d = vd, *n = vn, *m = vm, *a = va;
750 
751     for (i = 0; i < opr_sz / 8; ++i) {
752         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753     }
754 }
755 
756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757 {
758     intptr_t i, opr_sz = simd_oprsz(desc);
759     int64_t *d = vd, *n = vn, *m = vm;
760 
761     for (i = 0; i < opr_sz / 8; ++i) {
762         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763     }
764 }
765 
766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767 {
768     intptr_t i, opr_sz = simd_oprsz(desc);
769     int64_t *d = vd, *n = vn, *m = vm;
770 
771     for (i = 0; i < opr_sz / 8; ++i) {
772         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773     }
774 }
775 
776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777 {
778     intptr_t i, j, opr_sz = simd_oprsz(desc);
779     int idx = simd_data(desc);
780     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781 
782     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783         int64_t mm = m[i];
784         for (j = 0; j < 16 / 8; ++j) {
785             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786         }
787     }
788 }
789 
790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791 {
792     intptr_t i, j, opr_sz = simd_oprsz(desc);
793     int idx = simd_data(desc);
794     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795 
796     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797         int64_t mm = m[i];
798         for (j = 0; j < 16 / 8; ++j) {
799             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800         }
801     }
802 }
803 
804 /* Integer 8- and 16-bit dot-products.
805  *
806  * Note that for the loops herein, host endianness does not matter
807  * with respect to the ordering of data within the quad-width lanes.
808  * All elements are treated equally, no matter where they are.
809  */
810 
811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
813 {                                                                         \
814     intptr_t i, opr_sz = simd_oprsz(desc);                                \
815     TYPED *d = vd, *a = va;                                               \
816     TYPEN *n = vn;                                                        \
817     TYPEM *m = vm;                                                        \
818     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
819         d[i] = (a[i] +                                                    \
820                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
821                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
822                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
823                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
824     }                                                                     \
825     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
826 }
827 
828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
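
/*
 * Sketch of what the expansion above computes: for each TYPED-wide
 * destination lane i,
 *
 *     d[i] = a[i] + n[4i]*m[4i] + n[4i+1]*m[4i+1]
 *                 + n[4i+2]*m[4i+2] + n[4i+3]*m[4i+3]
 *
 * with the products widened to TYPED, so gvec_sdot_b accumulates four
 * signed byte products per 32-bit lane and the _h forms accumulate
 * four 16-bit products per 64-bit lane.
 */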
833 
834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
836 {                                                                         \
837     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
838     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
839     /*                                                                    \
840      * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
841      * first iteration might not be a full 16 byte segment. But           \
842      * for vector lengths beyond that this must be SVE and we know        \
843      * opr_sz is a multiple of 16, so we need not clamp segend            \
844      * to opr_sz_n when we advance it at the end of the loop.             \
845      */                                                                   \
846     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
847     intptr_t index = simd_data(desc);                                     \
848     TYPED *d = vd, *a = va;                                               \
849     TYPEN *n = vn;                                                        \
850     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
851     do {                                                                  \
852         TYPED m0 = m_indexed[i * 4 + 0];                                  \
853         TYPED m1 = m_indexed[i * 4 + 1];                                  \
854         TYPED m2 = m_indexed[i * 4 + 2];                                  \
855         TYPED m3 = m_indexed[i * 4 + 3];                                  \
856         do {                                                              \
857             d[i] = (a[i] +                                                \
858                     n[i * 4 + 0] * m0 +                                   \
859                     n[i * 4 + 1] * m1 +                                   \
860                     n[i * 4 + 2] * m2 +                                   \
861                     n[i * 4 + 3] * m3);                                   \
862         } while (++i < segend);                                           \
863         segend = i + (16 / sizeof(TYPED));                                \
864     } while (i < opr_sz_n);                                               \
865     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
866 }
867 
868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
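
/*
 * For example, gvec_sdot_idx_b on a 32-byte SVE vector with index 1
 * reloads m0..m3 from bytes 4..7 of each 16-byte segment of m at the
 * top of the outer loop, so every 32-bit lane in a segment is dotted
 * against that segment's single indexed group of four bytes.
 */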
874 
875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
876                          float_status *fpst, uint32_t desc)
877 {
878     uintptr_t opr_sz = simd_oprsz(desc);
879     float16 *d = vd;
880     float16 *n = vn;
881     float16 *m = vm;
882     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
883     uint32_t neg_imag = neg_real ^ 1;
884     uintptr_t i;
885 
886     /* Shift boolean to the sign bit so we can xor to negate.  */
887     neg_real <<= 15;
888     neg_imag <<= 15;
889 
890     for (i = 0; i < opr_sz / 2; i += 2) {
891         float16 e0 = n[H2(i)];
892         float16 e1 = m[H2(i + 1)] ^ neg_imag;
893         float16 e2 = n[H2(i + 1)];
894         float16 e3 = m[H2(i)] ^ neg_real;
895 
896         d[H2(i)] = float16_add(e0, e1, fpst);
897         d[H2(i + 1)] = float16_add(e2, e3, fpst);
898     }
899     clear_tail(d, opr_sz, simd_maxsz(desc));
900 }
901 
902 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
903                          float_status *fpst, uint32_t desc)
904 {
905     uintptr_t opr_sz = simd_oprsz(desc);
906     float32 *d = vd;
907     float32 *n = vn;
908     float32 *m = vm;
909     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
910     uint32_t neg_imag = neg_real ^ 1;
911     uintptr_t i;
912 
913     /* Shift boolean to the sign bit so we can xor to negate.  */
914     neg_real <<= 31;
915     neg_imag <<= 31;
916 
917     for (i = 0; i < opr_sz / 4; i += 2) {
918         float32 e0 = n[H4(i)];
919         float32 e1 = m[H4(i + 1)] ^ neg_imag;
920         float32 e2 = n[H4(i + 1)];
921         float32 e3 = m[H4(i)] ^ neg_real;
922 
923         d[H4(i)] = float32_add(e0, e1, fpst);
924         d[H4(i + 1)] = float32_add(e2, e3, fpst);
925     }
926     clear_tail(d, opr_sz, simd_maxsz(desc));
927 }
928 
929 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
930                          float_status *fpst, uint32_t desc)
931 {
932     uintptr_t opr_sz = simd_oprsz(desc);
933     float64 *d = vd;
934     float64 *n = vn;
935     float64 *m = vm;
936     uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
937     uint64_t neg_imag = neg_real ^ 1;
938     uintptr_t i;
939 
940     /* Shift boolean to the sign bit so we can xor to negate.  */
941     neg_real <<= 63;
942     neg_imag <<= 63;
943 
944     for (i = 0; i < opr_sz / 8; i += 2) {
945         float64 e0 = n[i];
946         float64 e1 = m[i + 1] ^ neg_imag;
947         float64 e2 = n[i + 1];
948         float64 e3 = m[i] ^ neg_real;
949 
950         d[i] = float64_add(e0, e1, fpst);
951         d[i + 1] = float64_add(e2, e3, fpst);
952     }
953     clear_tail(d, opr_sz, simd_maxsz(desc));
954 }
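
/*
 * In each of the three FCADD helpers above the per-pair effect is,
 * informally,
 *
 *     d[2k]     = n[2k]     + (m[2k + 1] ^ neg_imag)
 *     d[2k + 1] = n[2k + 1] + (m[2k]     ^ neg_real)
 *
 * i.e. a complex addition with m rotated by +90 or -90 degrees, the
 * single descriptor bit selecting which of the two m inputs has its
 * sign flipped.
 */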
955 
956 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
957                          float_status *fpst, uint32_t desc)
958 {
959     uintptr_t opr_sz = simd_oprsz(desc);
960     float16 *d = vd, *n = vn, *m = vm, *a = va;
961     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
962     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
963     uint32_t neg_real = flip ^ neg_imag;
964     uintptr_t i;
965 
966     /* Shift boolean to the sign bit so we can xor to negate.  */
967     neg_real <<= 15;
968     neg_imag <<= 15;
969 
970     for (i = 0; i < opr_sz / 2; i += 2) {
971         float16 e2 = n[H2(i + flip)];
972         float16 e1 = m[H2(i + flip)] ^ neg_real;
973         float16 e4 = e2;
974         float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
975 
976         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
977         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
978     }
979     clear_tail(d, opr_sz, simd_maxsz(desc));
980 }
981 
982 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
983                              float_status *fpst, uint32_t desc)
984 {
985     uintptr_t opr_sz = simd_oprsz(desc);
986     float16 *d = vd, *n = vn, *m = vm, *a = va;
987     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
988     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
989     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
990     uint32_t neg_real = flip ^ neg_imag;
991     intptr_t elements = opr_sz / sizeof(float16);
992     intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
993     intptr_t i, j;
994 
995     /* Shift boolean to the sign bit so we can xor to negate.  */
996     neg_real <<= 15;
997     neg_imag <<= 15;
998 
999     for (i = 0; i < elements; i += eltspersegment) {
1000         float16 mr = m[H2(i + 2 * index + 0)];
1001         float16 mi = m[H2(i + 2 * index + 1)];
1002         float16 e1 = neg_real ^ (flip ? mi : mr);
1003         float16 e3 = neg_imag ^ (flip ? mr : mi);
1004 
1005         for (j = i; j < i + eltspersegment; j += 2) {
1006             float16 e2 = n[H2(j + flip)];
1007             float16 e4 = e2;
1008 
1009             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
1010             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
1011         }
1012     }
1013     clear_tail(d, opr_sz, simd_maxsz(desc));
1014 }
1015 
1016 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
1017                          float_status *fpst, uint32_t desc)
1018 {
1019     uintptr_t opr_sz = simd_oprsz(desc);
1020     float32 *d = vd, *n = vn, *m = vm, *a = va;
1021     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1022     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1023     uint32_t neg_real = flip ^ neg_imag;
1024     uintptr_t i;
1025 
1026     /* Shift boolean to the sign bit so we can xor to negate.  */
1027     neg_real <<= 31;
1028     neg_imag <<= 31;
1029 
1030     for (i = 0; i < opr_sz / 4; i += 2) {
1031         float32 e2 = n[H4(i + flip)];
1032         float32 e1 = m[H4(i + flip)] ^ neg_real;
1033         float32 e4 = e2;
1034         float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
1035 
1036         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
1037         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
1038     }
1039     clear_tail(d, opr_sz, simd_maxsz(desc));
1040 }
1041 
1042 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
1043                              float_status *fpst, uint32_t desc)
1044 {
1045     uintptr_t opr_sz = simd_oprsz(desc);
1046     float32 *d = vd, *n = vn, *m = vm, *a = va;
1047     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1048     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1049     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1050     uint32_t neg_real = flip ^ neg_imag;
1051     intptr_t elements = opr_sz / sizeof(float32);
1052     intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
1053     intptr_t i, j;
1054 
1055     /* Shift boolean to the sign bit so we can xor to negate.  */
1056     neg_real <<= 31;
1057     neg_imag <<= 31;
1058 
1059     for (i = 0; i < elements; i += eltspersegment) {
1060         float32 mr = m[H4(i + 2 * index + 0)];
1061         float32 mi = m[H4(i + 2 * index + 1)];
1062         float32 e1 = neg_real ^ (flip ? mi : mr);
1063         float32 e3 = neg_imag ^ (flip ? mr : mi);
1064 
1065         for (j = i; j < i + eltspersegment; j += 2) {
1066             float32 e2 = n[H4(j + flip)];
1067             float32 e4 = e2;
1068 
1069             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
1070             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
1071         }
1072     }
1073     clear_tail(d, opr_sz, simd_maxsz(desc));
1074 }
1075 
1076 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
1077                          float_status *fpst, uint32_t desc)
1078 {
1079     uintptr_t opr_sz = simd_oprsz(desc);
1080     float64 *d = vd, *n = vn, *m = vm, *a = va;
1081     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1082     uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1083     uint64_t neg_real = flip ^ neg_imag;
1084     uintptr_t i;
1085 
1086     /* Shift boolean to the sign bit so we can xor to negate.  */
1087     neg_real <<= 63;
1088     neg_imag <<= 63;
1089 
1090     for (i = 0; i < opr_sz / 8; i += 2) {
1091         float64 e2 = n[i + flip];
1092         float64 e1 = m[i + flip] ^ neg_real;
1093         float64 e4 = e2;
1094         float64 e3 = m[i + 1 - flip] ^ neg_imag;
1095 
1096         d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
1097         d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
1098     }
1099     clear_tail(d, opr_sz, simd_maxsz(desc));
1100 }
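
/*
 * A sketch of the FCMLA helpers above, for flip == 0 with neither
 * negation bit set:
 *
 *     d[2k]     = fma(n[2k], m[2k],     a[2k])
 *     d[2k + 1] = fma(n[2k], m[2k + 1], a[2k + 1])
 *
 * i.e. the "real of n times (real, imag) of m" half of a complex
 * multiply-accumulate.  The other rotations select the imaginary part
 * of n (flip == 1) and/or negate one m operand, so that two chained
 * FCMLA operations accumulate a full complex product.
 */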
1101 
1102 /*
1103  * Floating point comparisons producing an integer result (all 1s or all 0s).
1104  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1105  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1106  */
1107 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1108 {
1109     return -float16_eq_quiet(op1, op2, stat);
1110 }
1111 
1112 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1113 {
1114     return -float32_eq_quiet(op1, op2, stat);
1115 }
1116 
1117 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1118 {
1119     return -float64_eq_quiet(op1, op2, stat);
1120 }
1121 
1122 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1123 {
1124     return -float16_le(op2, op1, stat);
1125 }
1126 
1127 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1128 {
1129     return -float32_le(op2, op1, stat);
1130 }
1131 
1132 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1133 {
1134     return -float64_le(op2, op1, stat);
1135 }
1136 
1137 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1138 {
1139     return -float16_lt(op2, op1, stat);
1140 }
1141 
1142 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1143 {
1144     return -float32_lt(op2, op1, stat);
1145 }
1146 
1147 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1148 {
1149     return -float64_lt(op2, op1, stat);
1150 }
1151 
1152 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1153 {
1154     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1155 }
1156 
1157 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1158 {
1159     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1160 }
1161 
1162 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1163 {
1164     return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1165 }
1166 
1167 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1168 {
1169     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1170 }
1171 
1172 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1173 {
1174     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1175 }
1176 
1177 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1178 {
1179     return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1180 }
1181 
1182 static int16_t vfp_tosszh(float16 x, float_status *fpst)
1183 {
1184     if (float16_is_any_nan(x)) {
1185         float_raise(float_flag_invalid, fpst);
1186         return 0;
1187     }
1188     return float16_to_int16_round_to_zero(x, fpst);
1189 }
1190 
1191 static uint16_t vfp_touszh(float16 x, float_status *fpst)
1192 {
1193     if (float16_is_any_nan(x)) {
1194         float_raise(float_flag_invalid, fpst);
1195         return 0;
1196     }
1197     return float16_to_uint16_round_to_zero(x, fpst);
1198 }
1199 
1200 #define DO_2OP(NAME, FUNC, TYPE) \
1201 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc)  \
1202 {                                                                 \
1203     intptr_t i, oprsz = simd_oprsz(desc);                         \
1204     TYPE *d = vd, *n = vn;                                        \
1205     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1206         d[i] = FUNC(n[i], stat);                                  \
1207     }                                                             \
1208     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1209 }
1210 
1211 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1212 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1213 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1214 
1215 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1216 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1217 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1218 
1219 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1220 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1221 
1222 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1223 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1224 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1225 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1226 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1227 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1228 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1229 DO_2OP(gvec_touszh, vfp_touszh, float16)
1230 
1231 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1232     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1233     {                                                           \
1234         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1235     }
1236 
1237 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1238     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1239     {                                                           \
1240         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1241     }
1242 
1243 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1244     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1245     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1246     WRAP_CMP0_##DIRN(FN, CMPOP, float64)                \
1247     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1248     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)   \
1249     DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)
1250 
1251 DO_2OP_CMP0(cgt, cgt, FWD)
1252 DO_2OP_CMP0(cge, cge, FWD)
1253 DO_2OP_CMP0(ceq, ceq, FWD)
1254 DO_2OP_CMP0(clt, cgt, REV)
1255 DO_2OP_CMP0(cle, cge, REV)
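
/*
 * The REV wrappers implement the less-than/less-equal-zero forms by
 * swapping operands into the existing cgt/cge helpers: for example
 * float32_clt0(op) expands to float32_cgt(float32_zero, op), still
 * producing the 0/-1 mask that the DO_2OP expansion stores per element.
 */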
1256 
1257 #undef DO_2OP
1258 #undef DO_2OP_CMP0
1259 
1260 /* Floating-point trigonometric starting value.
1261  * See the ARM ARM pseudocode function FPTrigSMul.
1262  */
1263 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1264 {
1265     float16 result = float16_mul(op1, op1, stat);
1266     if (!float16_is_any_nan(result)) {
1267         result = float16_set_sign(result, op2 & 1);
1268     }
1269     return result;
1270 }
1271 
1272 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1273 {
1274     float32 result = float32_mul(op1, op1, stat);
1275     if (!float32_is_any_nan(result)) {
1276         result = float32_set_sign(result, op2 & 1);
1277     }
1278     return result;
1279 }
1280 
1281 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1282 {
1283     float64 result = float64_mul(op1, op1, stat);
1284     if (!float64_is_any_nan(result)) {
1285         result = float64_set_sign(result, op2 & 1);
1286     }
1287     return result;
1288 }
1289 
1290 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1291 {
1292     return float16_abs(float16_sub(op1, op2, stat));
1293 }
1294 
1295 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1296 {
1297     return float32_abs(float32_sub(op1, op2, stat));
1298 }
1299 
1300 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1301 {
1302     return float64_abs(float64_sub(op1, op2, stat));
1303 }
1304 
1305 /*
1306  * Reciprocal step. These are the AArch32 versions, which use a
1307  * non-fused multiply-and-subtract.
1308  */
1309 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1310 {
1311     op1 = float16_squash_input_denormal(op1, stat);
1312     op2 = float16_squash_input_denormal(op2, stat);
1313 
1314     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1315         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1316         return float16_two;
1317     }
1318     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1319 }
1320 
1321 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1322 {
1323     op1 = float32_squash_input_denormal(op1, stat);
1324     op2 = float32_squash_input_denormal(op2, stat);
1325 
1326     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1327         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1328         return float32_two;
1329     }
1330     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1331 }
1332 
1333 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1334 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1335 {
1336     op1 = float16_squash_input_denormal(op1, stat);
1337     op2 = float16_squash_input_denormal(op2, stat);
1338 
1339     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1340         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1341         return float16_one_point_five;
1342     }
1343     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1344     return float16_div(op1, float16_two, stat);
1345 }
1346 
1347 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1348 {
1349     op1 = float32_squash_input_denormal(op1, stat);
1350     op2 = float32_squash_input_denormal(op2, stat);
1351 
1352     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1353         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1354         return float32_one_point_five;
1355     }
1356     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1357     return float32_div(op1, float32_two, stat);
1358 }
1359 
1360 #define DO_3OP(NAME, FUNC, TYPE) \
1361 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1362                   float_status *stat, uint32_t desc)                       \
1363 {                                                                          \
1364     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1365     TYPE *d = vd, *n = vn, *m = vm;                                        \
1366     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1367         d[i] = FUNC(n[i], m[i], stat);                                     \
1368     }                                                                      \
1369     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1370 }
1371 
1372 DO_3OP(gvec_fadd_h, float16_add, float16)
1373 DO_3OP(gvec_fadd_s, float32_add, float32)
1374 DO_3OP(gvec_fadd_d, float64_add, float64)
1375 
1376 DO_3OP(gvec_fsub_h, float16_sub, float16)
1377 DO_3OP(gvec_fsub_s, float32_sub, float32)
1378 DO_3OP(gvec_fsub_d, float64_sub, float64)
1379 
1380 DO_3OP(gvec_fmul_h, float16_mul, float16)
1381 DO_3OP(gvec_fmul_s, float32_mul, float32)
1382 DO_3OP(gvec_fmul_d, float64_mul, float64)
1383 
1384 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1385 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1386 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1387 
1388 DO_3OP(gvec_fabd_h, float16_abd, float16)
1389 DO_3OP(gvec_fabd_s, float32_abd, float32)
1390 DO_3OP(gvec_fabd_d, float64_abd, float64)
1391 
1392 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1393 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1394 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1395 
1396 DO_3OP(gvec_fcge_h, float16_cge, float16)
1397 DO_3OP(gvec_fcge_s, float32_cge, float32)
1398 DO_3OP(gvec_fcge_d, float64_cge, float64)
1399 
1400 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1401 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1402 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1403 
1404 DO_3OP(gvec_facge_h, float16_acge, float16)
1405 DO_3OP(gvec_facge_s, float32_acge, float32)
1406 DO_3OP(gvec_facge_d, float64_acge, float64)
1407 
1408 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1409 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1410 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1411 
1412 DO_3OP(gvec_fmax_h, float16_max, float16)
1413 DO_3OP(gvec_fmax_s, float32_max, float32)
1414 DO_3OP(gvec_fmax_d, float64_max, float64)
1415 
1416 DO_3OP(gvec_fmin_h, float16_min, float16)
1417 DO_3OP(gvec_fmin_s, float32_min, float32)
1418 DO_3OP(gvec_fmin_d, float64_min, float64)
1419 
1420 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1421 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1422 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1423 
1424 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1425 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1426 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1427 
1428 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1429 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1430 
1431 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1432 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1433 
1434 #ifdef TARGET_AARCH64
1435 DO_3OP(gvec_fdiv_h, float16_div, float16)
1436 DO_3OP(gvec_fdiv_s, float32_div, float32)
1437 DO_3OP(gvec_fdiv_d, float64_div, float64)
1438 
1439 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1440 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1441 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1442 
1443 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1444 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1445 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1446 
1447 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1448 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1449 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1450 
1451 #endif
1452 #undef DO_3OP
1453 
1454 /* Non-fused multiply-add (unlike float16_muladd etc., which are fused) */
1455 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1456                                  float_status *stat)
1457 {
1458     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1459 }
1460 
1461 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1462                                  float_status *stat)
1463 {
1464     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1465 }
1466 
1467 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1468                                  float_status *stat)
1469 {
1470     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1471 }
1472 
1473 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1474                                  float_status *stat)
1475 {
1476     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1477 }
1478 
1479 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1480 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1481                                 float_status *stat)
1482 {
1483     return float16_muladd(op1, op2, dest, 0, stat);
1484 }
1485 
1486 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1487                                  float_status *stat)
1488 {
1489     return float32_muladd(op1, op2, dest, 0, stat);
1490 }
1491 
1492 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1493                                  float_status *stat)
1494 {
1495     return float64_muladd(op1, op2, dest, 0, stat);
1496 }
1497 
1498 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1499                                  float_status *stat)
1500 {
1501     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1502 }
1503 
1504 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1505                                  float_status *stat)
1506 {
1507     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1508 }
1509 
1510 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1511                                  float_status *stat)
1512 {
1513     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1514 }
1515 
1516 #define DO_MULADD(NAME, FUNC, TYPE)                                        \
1517 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1518                   float_status *stat, uint32_t desc)                       \
1519 {                                                                          \
1520     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1521     TYPE *d = vd, *n = vn, *m = vm;                                        \
1522     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1523         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1524     }                                                                      \
1525     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1526 }
1527 
1528 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1529 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1530 
1531 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1532 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1533 
1534 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1535 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1536 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1537 
1538 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1539 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1540 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1541 
1542 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1543  * For AdvSIMD, there is of course only one such vector segment.
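 *
 * E.g. for 32-bit elements with idx == 1 and a 256-bit vector, roughly
 * (ignoring the host byte-order adjustment done via the H macros):
 *   d[0..3] use m[1]   -- first 128-bit segment
 *   d[4..7] use m[5]   -- second 128-bit segment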
1544  */
1545 
1546 #define DO_MUL_IDX(NAME, TYPE, H) \
1547 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1548 {                                                                          \
1549     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1550     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1551     intptr_t idx = simd_data(desc);                                        \
1552     TYPE *d = vd, *n = vn, *m = vm;                                        \
1553     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1554         TYPE mm = m[H(i + idx)];                                           \
1555         for (j = 0; j < segment; j++) {                                    \
1556             d[i + j] = n[i + j] * mm;                                      \
1557         }                                                                  \
1558     }                                                                      \
1559     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1560 }
1561 
1562 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1563 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1564 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1565 
1566 #undef DO_MUL_IDX
1567 
1568 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1569 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1570 {                                                                          \
1571     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1572     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1573     intptr_t idx = simd_data(desc);                                        \
1574     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1575     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1576         TYPE mm = m[H(i + idx)];                                           \
1577         for (j = 0; j < segment; j++) {                                    \
1578             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1579         }                                                                  \
1580     }                                                                      \
1581     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1582 }
1583 
1584 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1585 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1586 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1587 
1588 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1589 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1590 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1591 
1592 #undef DO_MLA_IDX
1593 
1594 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1595 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1596                   float_status *stat, uint32_t desc)                       \
1597 {                                                                          \
1598     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1599     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1600     intptr_t idx = simd_data(desc);                                        \
1601     TYPE *d = vd, *n = vn, *m = vm;                                        \
1602     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1603         TYPE mm = m[H(i + idx)];                                           \
1604         for (j = 0; j < segment; j++) {                                    \
1605             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1606         }                                                                  \
1607     }                                                                      \
1608     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1609 }
1610 
1611 #define nop(N, M, S) (M)
1612 
1613 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1614 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1615 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1616 
1617 #ifdef TARGET_AARCH64
1618 
1619 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1620 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1621 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1622 
1623 #endif
1624 
1625 #undef nop
1626 
1627 /*
1628  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1629  * the fused ops below they accumulate both from and into Vd.
1630  */
1631 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1632 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1633 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1634 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1635 
1636 #undef DO_FMUL_IDX
1637 
1638 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1639 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1640                   float_status *stat, uint32_t desc)                       \
1641 {                                                                          \
1642     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1643     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1644     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1645     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1646     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1647     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1648     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1649         TYPE mm = m[H(i + idx)];                                           \
1650         for (j = 0; j < segment; j++) {                                    \
1651             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1652                                      mm, a[i + j], 0, stat);               \
1653         }                                                                  \
1654     }                                                                      \
1655     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1656 }
1657 
1658 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1659 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1660 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1661 
1662 #undef DO_FMLA_IDX
1663 
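/*
 * Saturating add/sub.  The arithmetic is done in the wider type WTYPE and
 * then clamped to [MIN, MAX] for the destination type TYPEN; any clamping
 * also sets the sticky QC flag via the vq pointer.
 */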
1664 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1665 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1666 {                                                                          \
1667     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1668     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1669     bool q = false;                                                        \
1670     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1671         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1672         if (dd < MIN) {                                                    \
1673             dd = MIN;                                                      \
1674             q = true;                                                      \
1675         } else if (dd > MAX) {                                             \
1676             dd = MAX;                                                      \
1677             q = true;                                                      \
1678         }                                                                  \
1679         d[i] = dd;                                                         \
1680     }                                                                      \
1681     if (q) {                                                               \
1682         uint32_t *qc = vq;                                                 \
1683         qc[0] = 1;                                                         \
1684     }                                                                      \
1685     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1686 }
1687 
1688 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1689 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1690 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1691 
1692 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1693 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1694 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1695 
1696 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1697 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1698 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1699 
1700 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1701 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1702 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1703 
1704 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1705 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1706 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1707 
1708 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1709 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1710 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1711 
1712 #undef DO_SAT
1713 
1714 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1715                           void *vm, uint32_t desc)
1716 {
1717     intptr_t i, oprsz = simd_oprsz(desc);
1718     uint64_t *d = vd, *n = vn, *m = vm;
1719     bool q = false;
1720 
1721     for (i = 0; i < oprsz / 8; i++) {
1722         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1723         if (dd < nn) {
1724             dd = UINT64_MAX;
1725             q = true;
1726         }
1727         d[i] = dd;
1728     }
1729     if (q) {
1730         uint32_t *qc = vq;
1731         qc[0] = 1;
1732     }
1733     clear_tail(d, oprsz, simd_maxsz(desc));
1734 }
1735 
1736 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1737                           void *vm, uint32_t desc)
1738 {
1739     intptr_t i, oprsz = simd_oprsz(desc);
1740     uint64_t *d = vd, *n = vn, *m = vm;
1741     bool q = false;
1742 
1743     for (i = 0; i < oprsz / 8; i++) {
1744         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1745         if (nn < mm) {
1746             dd = 0;
1747             q = true;
1748         }
1749         d[i] = dd;
1750     }
1751     if (q) {
1752         uint32_t *qc = vq;
1753         qc[0] = 1;
1754     }
1755     clear_tail(d, oprsz, simd_maxsz(desc));
1756 }
1757 
1758 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1759                           void *vm, uint32_t desc)
1760 {
1761     intptr_t i, oprsz = simd_oprsz(desc);
1762     int64_t *d = vd, *n = vn, *m = vm;
1763     bool q = false;
1764 
1765     for (i = 0; i < oprsz / 8; i++) {
1766         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1767         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1768             dd = (nn >> 63) ^ ~INT64_MIN;
1769             q = true;
1770         }
1771         d[i] = dd;
1772     }
1773     if (q) {
1774         uint32_t *qc = vq;
1775         qc[0] = 1;
1776     }
1777     clear_tail(d, oprsz, simd_maxsz(desc));
1778 }
1779 
1780 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1781                           void *vm, uint32_t desc)
1782 {
1783     intptr_t i, oprsz = simd_oprsz(desc);
1784     int64_t *d = vd, *n = vn, *m = vm;
1785     bool q = false;
1786 
1787     for (i = 0; i < oprsz / 8; i++) {
1788         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1789         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1790             dd = (nn >> 63) ^ ~INT64_MIN;
1791             q = true;
1792         }
1793         d[i] = dd;
1794     }
1795     if (q) {
1796         uint32_t *qc = vq;
1797         qc[0] = 1;
1798     }
1799     clear_tail(d, oprsz, simd_maxsz(desc));
1800 }
1801 
1802 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1803                            void *vm, uint32_t desc)
1804 {
1805     intptr_t i, oprsz = simd_oprsz(desc);
1806     uint64_t *d = vd, *n = vn, *m = vm;
1807     bool q = false;
1808 
1809     for (i = 0; i < oprsz / 8; i++) {
1810         uint64_t nn = n[i];
1811         int64_t mm = m[i];
1812         uint64_t dd = nn + mm;
1813 
1814         if (mm < 0) {
1815             if (nn < (uint64_t)-mm) {
1816                 dd = 0;
1817                 q = true;
1818             }
1819         } else {
1820             if (dd < nn) {
1821                 dd = UINT64_MAX;
1822                 q = true;
1823             }
1824         }
1825         d[i] = dd;
1826     }
1827     if (q) {
1828         uint32_t *qc = vq;
1829         qc[0] = 1;
1830     }
1831     clear_tail(d, oprsz, simd_maxsz(desc));
1832 }
1833 
1834 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1835                            void *vm, uint32_t desc)
1836 {
1837     intptr_t i, oprsz = simd_oprsz(desc);
1838     uint64_t *d = vd, *n = vn, *m = vm;
1839     bool q = false;
1840 
1841     for (i = 0; i < oprsz / 8; i++) {
1842         int64_t nn = n[i];
1843         uint64_t mm = m[i];
1844         int64_t dd = nn + mm;
1845 
1846         if (mm > (uint64_t)(INT64_MAX - nn)) {
1847             dd = INT64_MAX;
1848             q = true;
1849         }
1850         d[i] = dd;
1851     }
1852     if (q) {
1853         uint32_t *qc = vq;
1854         qc[0] = 1;
1855     }
1856     clear_tail(d, oprsz, simd_maxsz(desc));
1857 }
1858 
1859 #define DO_SRA(NAME, TYPE)                              \
1860 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1861 {                                                       \
1862     intptr_t i, oprsz = simd_oprsz(desc);               \
1863     int shift = simd_data(desc);                        \
1864     TYPE *d = vd, *n = vn;                              \
1865     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1866         d[i] += n[i] >> shift;                          \
1867     }                                                   \
1868     clear_tail(d, oprsz, simd_maxsz(desc));             \
1869 }
1870 
1871 DO_SRA(gvec_ssra_b, int8_t)
1872 DO_SRA(gvec_ssra_h, int16_t)
1873 DO_SRA(gvec_ssra_s, int32_t)
1874 DO_SRA(gvec_ssra_d, int64_t)
1875 
1876 DO_SRA(gvec_usra_b, uint8_t)
1877 DO_SRA(gvec_usra_h, uint16_t)
1878 DO_SRA(gvec_usra_s, uint32_t)
1879 DO_SRA(gvec_usra_d, uint64_t)
1880 
1881 #undef DO_SRA
1882 
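/*
 * Rounding shift right: shift by (shift - 1) first so that the rounding
 * bit lands in bit 0, then add it back in.  This matches adding
 * 1 << (shift - 1) before shifting, without needing a wider type.
 */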
1883 #define DO_RSHR(NAME, TYPE)                             \
1884 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1885 {                                                       \
1886     intptr_t i, oprsz = simd_oprsz(desc);               \
1887     int shift = simd_data(desc);                        \
1888     TYPE *d = vd, *n = vn;                              \
1889     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1890         TYPE tmp = n[i] >> (shift - 1);                 \
1891         d[i] = (tmp >> 1) + (tmp & 1);                  \
1892     }                                                   \
1893     clear_tail(d, oprsz, simd_maxsz(desc));             \
1894 }
1895 
1896 DO_RSHR(gvec_srshr_b, int8_t)
1897 DO_RSHR(gvec_srshr_h, int16_t)
1898 DO_RSHR(gvec_srshr_s, int32_t)
1899 DO_RSHR(gvec_srshr_d, int64_t)
1900 
1901 DO_RSHR(gvec_urshr_b, uint8_t)
1902 DO_RSHR(gvec_urshr_h, uint16_t)
1903 DO_RSHR(gvec_urshr_s, uint32_t)
1904 DO_RSHR(gvec_urshr_d, uint64_t)
1905 
1906 #undef DO_RSHR
1907 
1908 #define DO_RSRA(NAME, TYPE)                             \
1909 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1910 {                                                       \
1911     intptr_t i, oprsz = simd_oprsz(desc);               \
1912     int shift = simd_data(desc);                        \
1913     TYPE *d = vd, *n = vn;                              \
1914     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1915         TYPE tmp = n[i] >> (shift - 1);                 \
1916         d[i] += (tmp >> 1) + (tmp & 1);                 \
1917     }                                                   \
1918     clear_tail(d, oprsz, simd_maxsz(desc));             \
1919 }
1920 
1921 DO_RSRA(gvec_srsra_b, int8_t)
1922 DO_RSRA(gvec_srsra_h, int16_t)
1923 DO_RSRA(gvec_srsra_s, int32_t)
1924 DO_RSRA(gvec_srsra_d, int64_t)
1925 
1926 DO_RSRA(gvec_ursra_b, uint8_t)
1927 DO_RSRA(gvec_ursra_h, uint16_t)
1928 DO_RSRA(gvec_ursra_s, uint32_t)
1929 DO_RSRA(gvec_ursra_d, uint64_t)
1930 
1931 #undef DO_RSRA
1932 
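/*
 * Shift right and insert: the low (esize - shift) bits of each element
 * of d are replaced with n >> shift; the top shift bits of d are kept.
 */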
1933 #define DO_SRI(NAME, TYPE)                              \
1934 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1935 {                                                       \
1936     intptr_t i, oprsz = simd_oprsz(desc);               \
1937     int shift = simd_data(desc);                        \
1938     TYPE *d = vd, *n = vn;                              \
1939     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1940         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1941     }                                                   \
1942     clear_tail(d, oprsz, simd_maxsz(desc));             \
1943 }
1944 
1945 DO_SRI(gvec_sri_b, uint8_t)
1946 DO_SRI(gvec_sri_h, uint16_t)
1947 DO_SRI(gvec_sri_s, uint32_t)
1948 DO_SRI(gvec_sri_d, uint64_t)
1949 
1950 #undef DO_SRI
1951 
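/*
 * Shift left and insert: the high (esize - shift) bits of each element
 * of d are replaced with n << shift; the low shift bits of d are kept.
 */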
1952 #define DO_SLI(NAME, TYPE)                              \
1953 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1954 {                                                       \
1955     intptr_t i, oprsz = simd_oprsz(desc);               \
1956     int shift = simd_data(desc);                        \
1957     TYPE *d = vd, *n = vn;                              \
1958     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1959         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1960     }                                                   \
1961     clear_tail(d, oprsz, simd_maxsz(desc));             \
1962 }
1963 
1964 DO_SLI(gvec_sli_b, uint8_t)
1965 DO_SLI(gvec_sli_h, uint16_t)
1966 DO_SLI(gvec_sli_s, uint32_t)
1967 DO_SLI(gvec_sli_d, uint64_t)
1968 
1969 #undef DO_SLI
1970 
1971 /*
1972  * Convert float16 to float32, raising no exceptions and
1973  * preserving exceptional values, including SNaN.
1974  * This is effectively an unpack+repack operation.
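 *
 * For reference, the fields being repacked:
 *   float16:  sign[15]   exp[14:10] (bias 15)    frac[9:0]
 *   float32:  sign[31]   exp[30:23] (bias 127)   frac[22:0]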
1975  */
1976 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1977 {
1978     const int f16_bias = 15;
1979     const int f32_bias = 127;
1980     uint32_t sign = extract32(f16, 15, 1);
1981     uint32_t exp = extract32(f16, 10, 5);
1982     uint32_t frac = extract32(f16, 0, 10);
1983 
1984     if (exp == 0x1f) {
1985         /* Inf or NaN */
1986         exp = 0xff;
1987     } else if (exp == 0) {
1988         /* Zero or denormal.  */
1989         if (frac != 0) {
1990             if (fz16) {
1991                 frac = 0;
1992             } else {
1993                 /*
1994                  * Denormal; these are all normal float32.
1995                  * Shift the fraction so that the msb is at bit 11,
1996                  * then remove bit 11 as the implicit bit of the
1997                  * normalized float32.  Note that we still go through
1998                  * the shift for normal numbers below, to put the
1999                  * float32 fraction at the right place.
2000                  */
2001                 int shift = clz32(frac) - 21;
2002                 frac = (frac << shift) & 0x3ff;
2003                 exp = f32_bias - f16_bias - shift + 1;
2004             }
2005         }
2006     } else {
2007         /* Normal number; adjust the bias.  */
2008         exp += f32_bias - f16_bias;
2009     }
2010     sign <<= 31;
2011     exp <<= 23;
2012     frac <<= 23 - 10;
2013 
2014     return sign | exp | frac;
2015 }
2016 
2017 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2018 {
2019     /*
2020      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2021      * Load the 2nd qword iff is_q & is_2.
2022      * Shift to the 2nd dword iff !is_q & is_2.
2023      * For !is_q & !is_2, the upper bits of the result are garbage.
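     *
     * Spelled out, the four (is_q, is_2) cases are:
     *   (0,0) -> ptr[0]        u32[0] in the low bits, high bits garbage
     *   (1,0) -> ptr[0]        u64[0]
     *   (0,1) -> ptr[0] >> 32  u32[1]
     *   (1,1) -> ptr[1]        u64[1]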
2024      */
2025     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2026 }
2027 
2028 /*
2029  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2030  * as there are not yet SVE versions that might use blocking.
2031  */
2032 
2033 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2034                      uint32_t desc, bool fz16)
2035 {
2036     intptr_t i, oprsz = simd_oprsz(desc);
2037     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2038     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2039     int is_q = oprsz == 16;
2040     uint64_t n_4, m_4;
2041 
2042     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2043     n_4 = load4_f16(vn, is_q, is_2);
2044     m_4 = load4_f16(vm, is_q, is_2);
2045 
2046     /* Negate all inputs for FMLSL at once.  */
2047     if (is_s) {
2048         n_4 ^= 0x8000800080008000ull;
2049     }
2050 
2051     for (i = 0; i < oprsz / 4; i++) {
2052         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2053         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2054         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2055     }
2056     clear_tail(d, oprsz, simd_maxsz(desc));
2057 }
2058 
2059 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2060                             CPUARMState *env, uint32_t desc)
2061 {
2062     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2063              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2064 }
2065 
2066 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2067                             CPUARMState *env, uint32_t desc)
2068 {
2069     do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, desc,
2070              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2071 }
2072 
2073 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2074                                CPUARMState *env, uint32_t desc)
2075 {
2076     intptr_t i, oprsz = simd_oprsz(desc);
2077     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2078     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2079     float_status *status = &env->vfp.fp_status_a64;
2080     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2081 
2082     for (i = 0; i < oprsz; i += sizeof(float32)) {
2083         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2084         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2085         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2086         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2087         float32 aa = *(float32 *)(va + H1_4(i));
2088 
2089         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2090     }
2091 }
2092 
2093 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2094                          uint32_t desc, bool fz16)
2095 {
2096     intptr_t i, oprsz = simd_oprsz(desc);
2097     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2098     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2099     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2100     int is_q = oprsz == 16;
2101     uint64_t n_4;
2102     float32 m_1;
2103 
2104     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2105     n_4 = load4_f16(vn, is_q, is_2);
2106 
2107     /* Negate all inputs for FMLSL at once.  */
2108     if (is_s) {
2109         n_4 ^= 0x8000800080008000ull;
2110     }
2111 
2112     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2113 
2114     for (i = 0; i < oprsz / 4; i++) {
2115         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2116         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2117     }
2118     clear_tail(d, oprsz, simd_maxsz(desc));
2119 }
2120 
2121 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2122                                 CPUARMState *env, uint32_t desc)
2123 {
2124     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2125                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2126 }
2127 
2128 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2129                                 CPUARMState *env, uint32_t desc)
2130 {
2131     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, desc,
2132                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2133 }
2134 
2135 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2136                                CPUARMState *env, uint32_t desc)
2137 {
2138     intptr_t i, j, oprsz = simd_oprsz(desc);
2139     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2140     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2141     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2142     float_status *status = &env->vfp.fp_status_a64;
2143     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2144 
2145     for (i = 0; i < oprsz; i += 16) {
2146         float16 mm_16 = *(float16 *)(vm + i + idx);
2147         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2148 
2149         for (j = 0; j < 16; j += sizeof(float32)) {
2150             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2151             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2152             float32 aa = *(float32 *)(va + H1_4(i + j));
2153 
2154             *(float32 *)(vd + H1_4(i + j)) =
2155                 float32_muladd(nn, mm, aa, 0, status);
2156         }
2157     }
2158 }
2159 
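/*
 * Shift by a signed, per-element shift count: a non-negative count shifts
 * left, a negative count shifts right.  Left shifts of esize or more give
 * zero; right shifts of esize or more give zero (unsigned) or the sign
 * extension (signed).
 */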
2160 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2161 {
2162     intptr_t i, opr_sz = simd_oprsz(desc);
2163     int8_t *d = vd, *n = vn, *m = vm;
2164 
2165     for (i = 0; i < opr_sz; ++i) {
2166         int8_t mm = m[i];
2167         int8_t nn = n[i];
2168         int8_t res = 0;
2169         if (mm >= 0) {
2170             if (mm < 8) {
2171                 res = nn << mm;
2172             }
2173         } else {
2174             res = nn >> (mm > -8 ? -mm : 7);
2175         }
2176         d[i] = res;
2177     }
2178     clear_tail(d, opr_sz, simd_maxsz(desc));
2179 }
2180 
2181 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2182 {
2183     intptr_t i, opr_sz = simd_oprsz(desc);
2184     int16_t *d = vd, *n = vn, *m = vm;
2185 
2186     for (i = 0; i < opr_sz / 2; ++i) {
2187         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2188         int16_t nn = n[i];
2189         int16_t res = 0;
2190         if (mm >= 0) {
2191             if (mm < 16) {
2192                 res = nn << mm;
2193             }
2194         } else {
2195             res = nn >> (mm > -16 ? -mm : 15);
2196         }
2197         d[i] = res;
2198     }
2199     clear_tail(d, opr_sz, simd_maxsz(desc));
2200 }
2201 
2202 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2203 {
2204     intptr_t i, opr_sz = simd_oprsz(desc);
2205     uint8_t *d = vd, *n = vn, *m = vm;
2206 
2207     for (i = 0; i < opr_sz; ++i) {
2208         int8_t mm = m[i];
2209         uint8_t nn = n[i];
2210         uint8_t res = 0;
2211         if (mm >= 0) {
2212             if (mm < 8) {
2213                 res = nn << mm;
2214             }
2215         } else {
2216             if (mm > -8) {
2217                 res = nn >> -mm;
2218             }
2219         }
2220         d[i] = res;
2221     }
2222     clear_tail(d, opr_sz, simd_maxsz(desc));
2223 }
2224 
2225 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2226 {
2227     intptr_t i, opr_sz = simd_oprsz(desc);
2228     uint16_t *d = vd, *n = vn, *m = vm;
2229 
2230     for (i = 0; i < opr_sz / 2; ++i) {
2231         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2232         uint16_t nn = n[i];
2233         uint16_t res = 0;
2234         if (mm >= 0) {
2235             if (mm < 16) {
2236                 res = nn << mm;
2237             }
2238         } else {
2239             if (mm > -16) {
2240                 res = nn >> -mm;
2241             }
2242         }
2243         d[i] = res;
2244     }
2245     clear_tail(d, opr_sz, simd_maxsz(desc));
2246 }
2247 
2248 /*
2249  * 8x8->8 polynomial multiply.
2250  *
2251  * Polynomial multiplication is like integer multiplication except the
2252  * partial products are XORed, not added.
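 *
 * One output lane could be computed, purely for illustration, as:
 *
 *  uint8_t res = 0;
 *  for (k = 0; k < 8; k++) {
 *      if (m & (1 << k)) {
 *          res ^= n << k;
 *      }
 *  }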
2253  *
2254  * TODO: expose this as a generic vector operation, as it is a common
2255  * crypto building block.
2256  */
2257 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2258 {
2259     intptr_t i, opr_sz = simd_oprsz(desc);
2260     uint64_t *d = vd, *n = vn, *m = vm;
2261 
2262     for (i = 0; i < opr_sz / 8; ++i) {
2263         d[i] = clmul_8x8_low(n[i], m[i]);
2264     }
2265     clear_tail(d, opr_sz, simd_maxsz(desc));
2266 }
2267 
2268 /*
2269  * 64x64->128 polynomial multiply.
2270  * Because the lanes are not accessed in strict columns,
2271  * this probably cannot be turned into a generic helper.
2272  */
2273 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2274 {
2275     intptr_t i, opr_sz = simd_oprsz(desc);
2276     intptr_t hi = simd_data(desc);
2277     uint64_t *d = vd, *n = vn, *m = vm;
2278 
2279     for (i = 0; i < opr_sz / 8; i += 2) {
2280         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2281         d[i] = int128_getlo(r);
2282         d[i + 1] = int128_gethi(r);
2283     }
2284     clear_tail(d, opr_sz, simd_maxsz(desc));
2285 }
2286 
2287 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2288 {
2289     int hi = simd_data(desc);
2290     uint64_t *d = vd, *n = vn, *m = vm;
2291     uint64_t nn = n[hi], mm = m[hi];
2292 
2293     d[0] = clmul_8x4_packed(nn, mm);
2294     nn >>= 32;
2295     mm >>= 32;
2296     d[1] = clmul_8x4_packed(nn, mm);
2297 
2298     clear_tail(d, 16, simd_maxsz(desc));
2299 }
2300 
2301 #ifdef TARGET_AARCH64
2302 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2303 {
2304     int shift = simd_data(desc) * 8;
2305     intptr_t i, opr_sz = simd_oprsz(desc);
2306     uint64_t *d = vd, *n = vn, *m = vm;
2307 
2308     for (i = 0; i < opr_sz / 8; ++i) {
2309         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2310     }
2311 }
2312 
2313 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2314 {
2315     intptr_t sel = H4(simd_data(desc));
2316     intptr_t i, opr_sz = simd_oprsz(desc);
2317     uint32_t *n = vn, *m = vm;
2318     uint64_t *d = vd;
2319 
2320     for (i = 0; i < opr_sz / 8; ++i) {
2321         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2322     }
2323 }
2324 #endif
2325 
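/*
 * Compare each element against zero, producing an all-ones mask where the
 * comparison is true and zero where it is false (the -(x OP 0) idiom).
 */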
2326 #define DO_CMP0(NAME, TYPE, OP)                         \
2327 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2328 {                                                       \
2329     intptr_t i, opr_sz = simd_oprsz(desc);              \
2330     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2331         TYPE nn = *(TYPE *)(vn + i);                    \
2332         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2333     }                                                   \
2334     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2335 }
2336 
2337 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2338 DO_CMP0(gvec_clt0_b, int8_t, <)
2339 DO_CMP0(gvec_cle0_b, int8_t, <=)
2340 DO_CMP0(gvec_cgt0_b, int8_t, >)
2341 DO_CMP0(gvec_cge0_b, int8_t, >=)
2342 
2343 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2344 DO_CMP0(gvec_clt0_h, int16_t, <)
2345 DO_CMP0(gvec_cle0_h, int16_t, <=)
2346 DO_CMP0(gvec_cgt0_h, int16_t, >)
2347 DO_CMP0(gvec_cge0_h, int16_t, >=)
2348 
2349 #undef DO_CMP0
2350 
2351 #define DO_ABD(NAME, TYPE)                                      \
2352 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2353 {                                                               \
2354     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2355     TYPE *d = vd, *n = vn, *m = vm;                             \
2356                                                                 \
2357     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2358         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2359     }                                                           \
2360     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2361 }
2362 
2363 DO_ABD(gvec_sabd_b, int8_t)
2364 DO_ABD(gvec_sabd_h, int16_t)
2365 DO_ABD(gvec_sabd_s, int32_t)
2366 DO_ABD(gvec_sabd_d, int64_t)
2367 
2368 DO_ABD(gvec_uabd_b, uint8_t)
2369 DO_ABD(gvec_uabd_h, uint16_t)
2370 DO_ABD(gvec_uabd_s, uint32_t)
2371 DO_ABD(gvec_uabd_d, uint64_t)
2372 
2373 #undef DO_ABD
2374 
2375 #define DO_ABA(NAME, TYPE)                                      \
2376 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2377 {                                                               \
2378     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2379     TYPE *d = vd, *n = vn, *m = vm;                             \
2380                                                                 \
2381     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2382         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2383     }                                                           \
2384     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2385 }
2386 
2387 DO_ABA(gvec_saba_b, int8_t)
2388 DO_ABA(gvec_saba_h, int16_t)
2389 DO_ABA(gvec_saba_s, int32_t)
2390 DO_ABA(gvec_saba_d, int64_t)
2391 
2392 DO_ABA(gvec_uaba_b, uint8_t)
2393 DO_ABA(gvec_uaba_h, uint16_t)
2394 DO_ABA(gvec_uaba_s, uint32_t)
2395 DO_ABA(gvec_uaba_d, uint64_t)
2396 
2397 #undef DO_ABA
2398 
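/*
 * Pairwise ops: the low half of the result is formed from adjacent pairs
 * of n, the high half from adjacent pairs of m.  A scratch copy of m is
 * needed when it overlaps d, as d is written before m is fully read.
 */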
2399 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2400 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
2401                   float_status *stat, uint32_t desc)                       \
2402 {                                                                          \
2403     ARMVectorReg scratch;                                                  \
2404     intptr_t oprsz = simd_oprsz(desc);                                     \
2405     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2406     TYPE *d = vd, *n = vn, *m = vm;                                        \
2407     if (unlikely(d == m)) {                                                \
2408         m = memcpy(&scratch, m, oprsz);                                    \
2409     }                                                                      \
2410     for (intptr_t i = 0; i < half; ++i) {                                  \
2411         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2412     }                                                                      \
2413     for (intptr_t i = 0; i < half; ++i) {                                  \
2414         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2415     }                                                                      \
2416     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2417 }
2418 
2419 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2420 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2421 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2422 
2423 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2424 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2425 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2426 
2427 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2428 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2429 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2430 
2431 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2432 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2433 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2434 
2435 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2436 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2437 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2438 
2439 #undef DO_3OP_PAIR
2440 
2441 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2442 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2443 {                                                               \
2444     ARMVectorReg scratch;                                       \
2445     intptr_t oprsz = simd_oprsz(desc);                          \
2446     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2447     TYPE *d = vd, *n = vn, *m = vm;                             \
2448     if (unlikely(d == m)) {                                     \
2449         m = memcpy(&scratch, m, oprsz);                         \
2450     }                                                           \
2451     for (intptr_t i = 0; i < half; ++i) {                       \
2452         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2453     }                                                           \
2454     for (intptr_t i = 0; i < half; ++i) {                       \
2455         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2456     }                                                           \
2457     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2458 }
2459 
2460 #define ADD(A, B) (A + B)
2461 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2462 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2463 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2464 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2465 #undef  ADD
2466 
2467 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2468 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2469 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2470 
2471 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2472 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2473 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2474 
2475 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2476 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2477 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2478 
2479 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2480 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2481 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2482 
2483 #undef DO_3OP_PAIR
2484 
2485 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2486     void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
2487     {                                                                   \
2488         intptr_t i, oprsz = simd_oprsz(desc);                           \
2489         int shift = simd_data(desc);                                    \
2490         TYPE *d = vd, *n = vn;                                          \
2491         float_status *fpst = stat;                                      \
2492         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2493             d[i] = FUNC(n[i], shift, fpst);                             \
2494         }                                                               \
2495         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2496     }
2497 
2498 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2499 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2500 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2501 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2502 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2503 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2504 
2505 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2506 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2507 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2508 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2509 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2510 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2511 
2512 #undef DO_VCVT_FIXED
2513 
2514 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2515     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2516     {                                                                   \
2517         intptr_t i, oprsz = simd_oprsz(desc);                           \
2518         uint32_t rmode = simd_data(desc);                               \
2519         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2520         TYPE *d = vd, *n = vn;                                          \
2521         set_float_rounding_mode(rmode, fpst);                           \
2522         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2523             d[i] = FUNC(n[i], 0, fpst);                                 \
2524         }                                                               \
2525         set_float_rounding_mode(prev_rmode, fpst);                      \
2526         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2527     }
2528 
2529 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
2530 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
2531 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2532 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2533 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2534 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2535 
2536 #undef DO_VCVT_RMODE
2537 
2538 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2539     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2540     {                                                                   \
2541         intptr_t i, oprsz = simd_oprsz(desc);                           \
2542         uint32_t rmode = simd_data(desc);                               \
2543         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2544         TYPE *d = vd, *n = vn;                                          \
2545         set_float_rounding_mode(rmode, fpst);                           \
2546         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2547             d[i] = FUNC(n[i], fpst);                                    \
2548         }                                                               \
2549         set_float_rounding_mode(prev_rmode, fpst);                      \
2550         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2551     }
2552 
2553 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2554 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2555 
2556 #undef DO_VRINT_RMODE
2557 
2558 #ifdef TARGET_AARCH64
2559 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
2560 {
2561     const uint8_t *indices = vm;
2562     size_t oprsz = simd_oprsz(desc);
2563     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2564     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2565     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2566     union {
2567         uint8_t b[16];
2568         uint64_t d[2];
2569     } result;
2570 
2571     /*
2572      * We must construct the final result in a temp, lest the output
2573      * overlap the input table.  For TBL, begin with zero; for TBX,
2574      * begin with the original register contents.  Note that we always
2575      * copy 16 bytes here to avoid an extra branch; clearing the high
2576      * bits of the register for oprsz == 8 is handled below.
2577      */
2578     if (is_tbx) {
2579         memcpy(&result, vd, 16);
2580     } else {
2581         memset(&result, 0, 16);
2582     }
2583 
2584     for (size_t i = 0; i < oprsz; ++i) {
2585         uint32_t index = indices[H1(i)];
2586 
2587         if (index < table_len) {
2588             /*
2589              * Convert index (a byte offset into the virtual table
2590              * which is a series of 128-bit vectors concatenated)
2591              * into the correct register element, bearing in mind
2592              * that the table can wrap around from V31 to V0.
2593              */
2594             const uint8_t *table = (const uint8_t *)
2595                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2596             result.b[H1(i)] = table[H1(index % 16)];
2597         }
2598     }
2599 
2600     memcpy(vd, &result, 16);
2601     clear_tail(vd, oprsz, simd_maxsz(desc));
2602 }
2603 #endif
2604 
2605 /*
2606  * NxN -> N highpart multiply
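 * E.g. for unsigned bytes, 0xff * 0xff = 0xfe01, so the high part is 0xfe.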
2607  *
2608  * TODO: expose this as a generic vector operation.
2609  */
2610 
2611 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2612 {
2613     intptr_t i, opr_sz = simd_oprsz(desc);
2614     int8_t *d = vd, *n = vn, *m = vm;
2615 
2616     for (i = 0; i < opr_sz; ++i) {
2617         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2618     }
2619     clear_tail(d, opr_sz, simd_maxsz(desc));
2620 }
2621 
2622 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2623 {
2624     intptr_t i, opr_sz = simd_oprsz(desc);
2625     int16_t *d = vd, *n = vn, *m = vm;
2626 
2627     for (i = 0; i < opr_sz / 2; ++i) {
2628         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2629     }
2630     clear_tail(d, opr_sz, simd_maxsz(desc));
2631 }
2632 
2633 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2634 {
2635     intptr_t i, opr_sz = simd_oprsz(desc);
2636     int32_t *d = vd, *n = vn, *m = vm;
2637 
2638     for (i = 0; i < opr_sz / 4; ++i) {
2639         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2640     }
2641     clear_tail(d, opr_sz, simd_maxsz(desc));
2642 }
2643 
2644 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2645 {
2646     intptr_t i, opr_sz = simd_oprsz(desc);
2647     uint64_t *d = vd, *n = vn, *m = vm;
2648     uint64_t discard;
2649 
2650     for (i = 0; i < opr_sz / 8; ++i) {
2651         muls64(&discard, &d[i], n[i], m[i]);
2652     }
2653     clear_tail(d, opr_sz, simd_maxsz(desc));
2654 }
2655 
2656 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2657 {
2658     intptr_t i, opr_sz = simd_oprsz(desc);
2659     uint8_t *d = vd, *n = vn, *m = vm;
2660 
2661     for (i = 0; i < opr_sz; ++i) {
2662         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2663     }
2664     clear_tail(d, opr_sz, simd_maxsz(desc));
2665 }
2666 
2667 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2668 {
2669     intptr_t i, opr_sz = simd_oprsz(desc);
2670     uint16_t *d = vd, *n = vn, *m = vm;
2671 
2672     for (i = 0; i < opr_sz / 2; ++i) {
2673         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2674     }
2675     clear_tail(d, opr_sz, simd_maxsz(desc));
2676 }
2677 
2678 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2679 {
2680     intptr_t i, opr_sz = simd_oprsz(desc);
2681     uint32_t *d = vd, *n = vn, *m = vm;
2682 
2683     for (i = 0; i < opr_sz / 4; ++i) {
2684         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2685     }
2686     clear_tail(d, opr_sz, simd_maxsz(desc));
2687 }
2688 
2689 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2690 {
2691     intptr_t i, opr_sz = simd_oprsz(desc);
2692     uint64_t *d = vd, *n = vn, *m = vm;
2693     uint64_t discard;
2694 
2695     for (i = 0; i < opr_sz / 8; ++i) {
2696         mulu64(&discard, &d[i], n[i], m[i]);
2697     }
2698     clear_tail(d, opr_sz, simd_maxsz(desc));
2699 }
2700 
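/* XAR: exclusive-or the two inputs, then rotate each element right. */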
2701 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2702 {
2703     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2704     int shr = simd_data(desc);
2705     uint64_t *d = vd, *n = vn, *m = vm;
2706 
2707     for (i = 0; i < opr_sz; ++i) {
2708         d[i] = ror64(n[i] ^ m[i], shr);
2709     }
2710     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2711 }
2712 
2713 /*
2714  * Integer matrix-multiply accumulate
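 *
 * Each 128-bit segment holds a 2x2 matrix of 32-bit accumulators:
 * element (i, j) is updated with the dot product of the i'th 8-byte
 * row of the first operand and the j'th 8-byte row of the second.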
2715  */
2716 
2717 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2718 {
2719     int8_t *n = vn, *m = vm;
2720 
2721     for (intptr_t k = 0; k < 8; ++k) {
2722         sum += n[H1(k)] * m[H1(k)];
2723     }
2724     return sum;
2725 }
2726 
2727 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2728 {
2729     uint8_t *n = vn, *m = vm;
2730 
2731     for (intptr_t k = 0; k < 8; ++k) {
2732         sum += n[H1(k)] * m[H1(k)];
2733     }
2734     return sum;
2735 }
2736 
2737 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2738 {
2739     uint8_t *n = vn;
2740     int8_t *m = vm;
2741 
2742     for (intptr_t k = 0; k < 8; ++k) {
2743         sum += n[H1(k)] * m[H1(k)];
2744     }
2745     return sum;
2746 }
2747 
2748 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2749                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2750 {
2751     intptr_t seg, opr_sz = simd_oprsz(desc);
2752 
2753     for (seg = 0; seg < opr_sz; seg += 16) {
2754         uint32_t *d = vd + seg;
2755         uint32_t *a = va + seg;
2756         uint32_t sum0, sum1, sum2, sum3;
2757 
2758         /*
2759          * Process the entire segment at once, writing back the
2760          * results only after we've consumed all of the inputs.
2761          *
2762          * Key to indices by column:
2763          *          i   j                  i             j
2764          */
2765         sum0 = a[H4(0 + 0)];
2766         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2767         sum1 = a[H4(0 + 1)];
2768         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2769         sum2 = a[H4(2 + 0)];
2770         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2771         sum3 = a[H4(2 + 1)];
2772         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2773 
2774         d[H4(0)] = sum0;
2775         d[H4(1)] = sum1;
2776         d[H4(2)] = sum2;
2777         d[H4(3)] = sum3;
2778     }
2779     clear_tail(vd, opr_sz, simd_maxsz(desc));
2780 }
2781 
2782 #define DO_MMLA_B(NAME, INNER) \
2783     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2784     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2785 
2786 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2787 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2788 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2789 
2790 /*
2791  * BFloat16 Dot Product
2792  */
2793 
2794 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2795 {
2796     /*
2797      * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2798      * For EBF = 0, we ignore the FPCR bits which determine rounding
2799      * mode and denormal-flushing, and we do unfused multiplies and
2800      * additions with intermediate rounding of all products and sums.
2801      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2802      * and we perform a fused two-way sum-of-products without intermediate
2803      * rounding of the products.
2804      * In either case, we don't set fp exception flags.
2805      *
2806      * EBF is AArch64 only, so even if it's set in the FPCR it has
2807      * no effect on AArch32 instructions.
2808      */
2809     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2810 
2811     *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32;
2812     set_default_nan_mode(true, statusp);
2813 
2814     if (ebf) {
2815         /* EBF=1 needs to do a step with round-to-odd semantics */
2816         *oddstatusp = *statusp;
2817         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2818     } else {
2819         set_flush_to_zero(true, statusp);
2820         set_flush_inputs_to_zero(true, statusp);
2821         set_float_rounding_mode(float_round_to_odd_inf, statusp);
2822     }
2823     return ebf;
2824 }
2825 
2826 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2827 {
2828     float32 t1, t2;
2829 
2830     /*
2831      * Extract each BFloat16 from the element pair, and shift
2832      * them such that they become float32.
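     * (A bfloat16 value is simply the high 16 bits of the corresponding
     * float32, so the shift and mask here are exact reinterpretations.)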
2833      */
2834     t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2835     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2836     t1 = float32_add(t1, t2, fpst);
2837     t1 = float32_add(sum, t1, fpst);
2838 
2839     return t1;
2840 }
2841 
2842 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
2843                      float_status *fpst, float_status *fpst_odd)
2844 {
2845     /*
2846      * Compare f16_dotadd() in sme_helper.c, but here we have
2847      * bfloat16 inputs. In particular that means that we do not
2848      * want the FPCR.FZ16 flush semantics, so we use the normal
2849      * float_status for the input handling here.
2850      */
2851     float64 e1r = float32_to_float64(e1 << 16, fpst);
2852     float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
2853     float64 e2r = float32_to_float64(e2 << 16, fpst);
2854     float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
2855     float64 t64;
2856     float32 t32;
2857 
2858     /*
2859      * The ARM pseudocode function FPDot performs both multiplies
2860      * and the add with a single rounding operation.  Emulate this
2861      * by performing the first multiply in round-to-odd, then doing
2862      * the second multiply as fused multiply-add, and rounding to
2863      * float32 all in one step.
2864      */
2865     t64 = float64_mul(e1r, e2r, fpst_odd);
2866     t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
2867 
2868     /* This conversion is exact, because we've already rounded. */
2869     t32 = float64_to_float32(t64, fpst);
2870 
2871     /* The final accumulation step is not fused. */
2872     return float32_add(sum, t32, fpst);
2873 }
2874 
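/*
 * BFDOT (vector): each float32 element of d accumulates the two-way
 * dot product of the corresponding bfloat16 pairs from n and m.
 */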
2875 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
2876                         CPUARMState *env, uint32_t desc)
2877 {
2878     intptr_t i, opr_sz = simd_oprsz(desc);
2879     float32 *d = vd, *a = va;
2880     uint32_t *n = vn, *m = vm;
2881     float_status fpst, fpst_odd;
2882 
2883     if (is_ebf(env, &fpst, &fpst_odd)) {
2884         for (i = 0; i < opr_sz / 4; ++i) {
2885             d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
2886         }
2887     } else {
2888         for (i = 0; i < opr_sz / 4; ++i) {
2889             d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
2890         }
2891     }
2892     clear_tail(d, opr_sz, simd_maxsz(desc));
2893 }
2894 
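/*
 * BFDOT (by element): as gvec_bfdot, but the bfloat16 pair of m is
 * selected by 'index' within each 128-bit segment and shared by all
 * elements of that segment.
 */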
2895 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2896                             void *va, CPUARMState *env, uint32_t desc)
2897 {
2898     intptr_t i, j, opr_sz = simd_oprsz(desc);
2899     intptr_t index = simd_data(desc);
2900     intptr_t elements = opr_sz / 4;
2901     intptr_t eltspersegment = MIN(16 / 4, elements);
2902     float32 *d = vd, *a = va;
2903     uint32_t *n = vn, *m = vm;
2904     float_status fpst, fpst_odd;
2905 
2906     if (is_ebf(env, &fpst, &fpst_odd)) {
2907         for (i = 0; i < elements; i += eltspersegment) {
2908             uint32_t m_idx = m[i + H4(index)];
2909 
2910             for (j = i; j < i + eltspersegment; j++) {
2911                 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
2912             }
2913         }
2914     } else {
2915         for (i = 0; i < elements; i += eltspersegment) {
2916             uint32_t m_idx = m[i + H4(index)];
2917 
2918             for (j = i; j < i + eltspersegment; j++) {
2919                 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
2920             }
2921         }
2922     }
2923     clear_tail(d, opr_sz, simd_maxsz(desc));
2924 }
2925 
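/*
 * BFMMLA: per 128-bit segment, view n and m as 2x4 matrices of bfloat16
 * and accumulate D = A + N * M^T into the 2x2 matrix of float32 elements,
 * with each pair of products summed by bfdotadd()/bfdotadd_ebf().
 */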
2926 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
2927                          CPUARMState *env, uint32_t desc)
2928 {
2929     intptr_t s, opr_sz = simd_oprsz(desc);
2930     float32 *d = vd, *a = va;
2931     uint32_t *n = vn, *m = vm;
2932     float_status fpst, fpst_odd;
2933 
2934     if (is_ebf(env, &fpst, &fpst_odd)) {
2935         for (s = 0; s < opr_sz / 4; s += 4) {
2936             float32 sum00, sum01, sum10, sum11;
2937 
2938             /*
2939              * Process the entire segment at once, writing back the
2940              * results only after we've consumed all of the inputs.
2941              *
2942              * Key to indices by column:
2943              *               i   j               i   k             j   k
2944              */
2945             sum00 = a[s + H4(0 + 0)];
2946             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2947             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2948 
2949             sum01 = a[s + H4(0 + 1)];
2950             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2951             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2952 
2953             sum10 = a[s + H4(2 + 0)];
2954             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2955             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2956 
2957             sum11 = a[s + H4(2 + 1)];
2958             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2959             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2960 
2961             d[s + H4(0 + 0)] = sum00;
2962             d[s + H4(0 + 1)] = sum01;
2963             d[s + H4(2 + 0)] = sum10;
2964             d[s + H4(2 + 1)] = sum11;
2965         }
2966     } else {
2967         for (s = 0; s < opr_sz / 4; s += 4) {
2968             float32 sum00, sum01, sum10, sum11;
2969 
2970             /*
2971              * Process the entire segment at once, writing back the
2972              * results only after we've consumed all of the inputs.
2973              *
2974              * Key to indices by column:
2975              *               i   j           i   k             j   k
2976              */
2977             sum00 = a[s + H4(0 + 0)];
2978             sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
2979             sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
2980 
2981             sum01 = a[s + H4(0 + 1)];
2982             sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
2983             sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
2984 
2985             sum10 = a[s + H4(2 + 0)];
2986             sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
2987             sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
2988 
2989             sum11 = a[s + H4(2 + 1)];
2990             sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
2991             sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
2992 
2993             d[s + H4(0 + 0)] = sum00;
2994             d[s + H4(0 + 1)] = sum01;
2995             d[s + H4(2 + 0)] = sum10;
2996             d[s + H4(2 + 1)] = sum11;
2997         }
2998     }
2999     clear_tail(d, opr_sz, simd_maxsz(desc));
3000 }
3001 
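/*
 * BFMLALB/BFMLALT: widen the even (sel == 0) or odd (sel == 1) bfloat16
 * element of each pair in n and m to float32, then fused multiply-add
 * into the float32 accumulator.
 */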
3002 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
3003                          float_status *stat, uint32_t desc)
3004 {
3005     intptr_t i, opr_sz = simd_oprsz(desc);
3006     intptr_t sel = simd_data(desc);
3007     float32 *d = vd, *a = va;
3008     bfloat16 *n = vn, *m = vm;
3009 
3010     for (i = 0; i < opr_sz / 4; ++i) {
3011         float32 nn = n[H2(i * 2 + sel)] << 16;
3012         float32 mm = m[H2(i * 2 + sel)] << 16;
3013         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
3014     }
3015     clear_tail(d, opr_sz, simd_maxsz(desc));
3016 }
3017 
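/*
 * BFMLALB/BFMLALT (by element): as gvec_bfmlal, but the m element is
 * selected by 'index' within each 128-bit segment.
 */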
3018 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
3019                              void *va, float_status *stat, uint32_t desc)
3020 {
3021     intptr_t i, j, opr_sz = simd_oprsz(desc);
3022     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3023     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3024     intptr_t elements = opr_sz / 4;
3025     intptr_t eltspersegment = MIN(16 / 4, elements);
3026     float32 *d = vd, *a = va;
3027     bfloat16 *n = vn, *m = vm;
3028 
3029     for (i = 0; i < elements; i += eltspersegment) {
3030         float32 m_idx = m[H2(2 * i + index)] << 16;
3031 
3032         for (j = i; j < i + eltspersegment; j++) {
3033             float32 n_j = n[H2(2 * j + sel)] << 16;
3034             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
3035         }
3036     }
3037     clear_tail(d, opr_sz, simd_maxsz(desc));
3038 }
3039 
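/*
 * SCLAMP/UCLAMP: clamp each element of a to the inclusive range [n, m],
 * i.e. d = MIN(MAX(a, n), m).
 */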
3040 #define DO_CLAMP(NAME, TYPE) \
3041 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
3042 {                                                                       \
3043     intptr_t i, opr_sz = simd_oprsz(desc);                              \
3044     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
3045         TYPE aa = *(TYPE *)(a + i);                                     \
3046         TYPE nn = *(TYPE *)(n + i);                                     \
3047         TYPE mm = *(TYPE *)(m + i);                                     \
3048         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
3049         *(TYPE *)(d + i) = dd;                                          \
3050     }                                                                   \
3051     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
3052 }
3053 
3054 DO_CLAMP(gvec_sclamp_b, int8_t)
3055 DO_CLAMP(gvec_sclamp_h, int16_t)
3056 DO_CLAMP(gvec_sclamp_s, int32_t)
3057 DO_CLAMP(gvec_sclamp_d, int64_t)
3058 
3059 DO_CLAMP(gvec_uclamp_b, uint8_t)
3060 DO_CLAMP(gvec_uclamp_h, uint16_t)
3061 DO_CLAMP(gvec_uclamp_s, uint32_t)
3062 DO_CLAMP(gvec_uclamp_d, uint64_t)
3063 
3064 /* Bit count in each 8-bit word. */
3065 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
3066 {
3067     intptr_t i, opr_sz = simd_oprsz(desc);
3068     uint8_t *d = vd, *n = vn;
3069 
3070     for (i = 0; i < opr_sz; ++i) {
3071         d[i] = ctpop8(n[i]);
3072     }
3073     clear_tail(d, opr_sz, simd_maxsz(desc));
3074 }
3075 
3076 /* Reverse bits in each 8-bit word. */
3077 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
3078 {
3079     intptr_t i, opr_sz = simd_oprsz(desc);
3080     uint64_t *d = vd, *n = vn;
3081 
3082     for (i = 0; i < opr_sz / 8; ++i) {
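        /*
         * bswap64 pre-reverses the byte order, so that revbit64's full
         * 64-bit reversal leaves the bytes in place with only the bits
         * within each byte reversed.
         */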
3083         d[i] = revbit64(bswap64(n[i]));
3084     }
3085     clear_tail(d, opr_sz, simd_maxsz(desc));
3086 }
3087 
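/* Unsigned reciprocal estimate (URECPE) of each 32-bit element. */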
3088 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
3089 {
3090     intptr_t i, opr_sz = simd_oprsz(desc);
3091     uint32_t *d = vd, *n = vn;
3092 
3093     for (i = 0; i < opr_sz / 4; ++i) {
3094         d[i] = helper_recpe_u32(n[i]);
3095     }
3096     clear_tail(d, opr_sz, simd_maxsz(desc));
3097 }
3098 
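/* Unsigned reciprocal square root estimate (URSQRTE) of each 32-bit element. */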
3099 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
3100 {
3101     intptr_t i, opr_sz = simd_oprsz(desc);
3102     uint32_t *d = vd, *n = vn;
3103 
3104     for (i = 0; i < opr_sz / 4; ++i) {
3105         d[i] = helper_rsqrte_u32(n[i]);
3106     }
3107     clear_tail(d, opr_sz, simd_maxsz(desc));
3108 }
3109