xref: /qemu/target/arm/tcg/vec_helper.c (revision aec7ae42a9f4f3eaf40d66b7be8de8a6da6c9cea)
1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28 
29 /*
30  * Data for expanding active predicate bits to bytes, for byte elements.
31  *
32  *  for (i = 0; i < 256; ++i) {
33  *      unsigned long m = 0;
34  *      for (j = 0; j < 8; j++) {
35  *          if ((i >> j) & 1) {
36  *              m |= 0xfful << (j << 3);
37  *          }
38  *      }
39  *      printf("0x%016lx,\n", m);
40  *  }
41  */
42 const uint64_t expand_pred_b_data[256] = {
43     0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44     0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45     0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46     0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47     0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48     0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49     0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50     0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51     0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52     0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53     0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54     0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55     0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56     0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57     0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58     0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59     0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60     0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61     0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62     0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63     0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64     0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65     0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66     0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67     0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68     0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69     0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70     0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71     0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72     0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73     0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74     0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75     0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76     0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77     0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78     0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79     0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80     0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81     0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82     0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83     0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84     0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85     0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86     0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87     0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88     0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89     0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90     0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91     0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92     0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93     0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94     0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95     0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96     0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97     0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98     0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99     0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100     0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101     0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102     0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103     0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104     0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105     0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106     0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107     0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108     0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109     0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110     0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111     0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112     0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113     0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114     0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115     0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116     0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117     0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118     0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119     0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120     0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121     0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122     0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123     0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124     0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125     0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126     0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127     0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128     0xffffffffffffffff,
129 };
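/*
 * As a concrete illustration: a predicate byte of 0x05 has bits 0 and 2
 * set, so bytes 0 and 2 of the expansion are all-ones:
 *     expand_pred_b_data[0x05] == 0x0000000000ff00ff
 * A predicated helper can AND this mask against a 64-bit data word to
 * keep only the active byte elements.
 */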
130 
131 /*
132  * Similarly for half-word elements.
133  *  for (i = 0; i < 256; ++i) {
134  *      unsigned long m = 0;
135  *      if (i & 0xaa) {
136  *          continue;
137  *      }
138  *      for (j = 0; j < 8; j += 2) {
139  *          if ((i >> j) & 1) {
140  *              m |= 0xfffful << (j << 3);
141  *          }
142  *      }
143  *      printf("[0x%x] = 0x%016lx,\n", i, m);
144  *  }
145  */
146 const uint64_t expand_pred_h_data[0x55 + 1] = {
147     [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148     [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149     [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150     [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151     [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152     [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153     [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154     [0x55] = 0xffffffffffffffff,
155 };
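/*
 * Only the even predicate bits are significant for halfword elements,
 * hence the sparse designated initializers.  For example, index 0x51
 * (bits 0, 4 and 6 set) expands halfwords 0, 2 and 3:
 *     expand_pred_h_data[0x51] == 0xffffffff0000ffff
 */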
156 
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159                      bool neg, bool round)
160 {
161     /*
162      * Simplify:
163      * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164      * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
165      */
166     int32_t ret = (int32_t)src1 * src2;
167     if (neg) {
168         ret = -ret;
169     }
170     ret += ((int32_t)src3 << 7) + (round << 6);
171     ret >>= 7;
172 
173     if (ret != (int8_t)ret) {
174         ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175     }
176     return ret;
177 }
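/*
 * Quick check of the simplification above with example inputs
 * src1 = src2 = 64, src3 = 0, round = true:
 *     ret = 64 * 64 = 4096;  ret += (0 << 7) + (1 << 6);  ret >>= 7
 * which gives 32, the same as ((4096 << 1) + (1 << 7)) >> 8 from the
 * unsimplified form.  With src1 = src2 = INT8_MIN the shifted result
 * is 128, which does not fit in int8_t, so it saturates to INT8_MAX.
 */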
178 
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180                              void *va, uint32_t desc)
181 {
182     intptr_t i, opr_sz = simd_oprsz(desc);
183     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 
185     for (i = 0; i < opr_sz; ++i) {
186         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187     }
188 }
189 
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191                              void *va, uint32_t desc)
192 {
193     intptr_t i, opr_sz = simd_oprsz(desc);
194     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 
196     for (i = 0; i < opr_sz; ++i) {
197         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198     }
199 }
200 
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203     intptr_t i, opr_sz = simd_oprsz(desc);
204     int8_t *d = vd, *n = vn, *m = vm;
205 
206     for (i = 0; i < opr_sz; ++i) {
207         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208     }
209 }
210 
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213     intptr_t i, opr_sz = simd_oprsz(desc);
214     int8_t *d = vd, *n = vn, *m = vm;
215 
216     for (i = 0; i < opr_sz; ++i) {
217         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218     }
219 }
220 
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223                       bool neg, bool round, uint32_t *sat)
224 {
225     /* Simplify similarly to do_sqrdmlah_b above.  */
226     int32_t ret = (int32_t)src1 * src2;
227     if (neg) {
228         ret = -ret;
229     }
230     ret += ((int32_t)src3 << 15) + (round << 14);
231     ret >>= 15;
232 
233     if (ret != (int16_t)ret) {
234         *sat = 1;
235         ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236     }
237     return ret;
238 }
239 
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241                                   uint32_t src2, uint32_t src3)
242 {
243     uint32_t *sat = &env->vfp.qc[0];
244     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246                                 false, true, sat);
247     return deposit32(e1, 16, 16, e2);
248 }
249 
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251                               void *vq, uint32_t desc)
252 {
253     uintptr_t opr_sz = simd_oprsz(desc);
254     int16_t *d = vd;
255     int16_t *n = vn;
256     int16_t *m = vm;
257     uintptr_t i;
258 
259     for (i = 0; i < opr_sz / 2; ++i) {
260         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261     }
262     clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264 
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266                                   uint32_t src2, uint32_t src3)
267 {
268     uint32_t *sat = &env->vfp.qc[0];
269     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271                                 true, true, sat);
272     return deposit32(e1, 16, 16, e2);
273 }
274 
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276                               void *vq, uint32_t desc)
277 {
278     uintptr_t opr_sz = simd_oprsz(desc);
279     int16_t *d = vd;
280     int16_t *n = vn;
281     int16_t *m = vm;
282     uintptr_t i;
283 
284     for (i = 0; i < opr_sz / 2; ++i) {
285         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286     }
287     clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289 
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291                             void *vq, uint32_t desc)
292 {
293     intptr_t i, opr_sz = simd_oprsz(desc);
294     int16_t *d = vd, *n = vn, *m = vm;
295 
296     for (i = 0; i < opr_sz / 2; ++i) {
297         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298     }
299     clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301 
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303                              void *vq, uint32_t desc)
304 {
305     intptr_t i, opr_sz = simd_oprsz(desc);
306     int16_t *d = vd, *n = vn, *m = vm;
307 
308     for (i = 0; i < opr_sz / 2; ++i) {
309         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310     }
311     clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313 
314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315                                 void *vq, uint32_t desc)
316 {
317     intptr_t i, j, opr_sz = simd_oprsz(desc);
318     int idx = simd_data(desc);
319     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320     intptr_t elements = opr_sz / 2;
321     intptr_t eltspersegment = MIN(16 / 2, elements);
322 
323     for (i = 0; i < elements; i += 16 / 2) {
324         int16_t mm = m[i];
325         for (j = 0; j < eltspersegment; ++j) {
326             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
327         }
328     }
329     clear_tail(d, opr_sz, simd_maxsz(desc));
330 }
331 
332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
333                                  void *vq, uint32_t desc)
334 {
335     intptr_t i, j, opr_sz = simd_oprsz(desc);
336     int idx = simd_data(desc);
337     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338     intptr_t elements = opr_sz / 2;
339     intptr_t eltspersegment = MIN(16 / 2, elements);
340 
341     for (i = 0; i < elements; i += 16 / 2) {
342         int16_t mm = m[i];
343         for (j = 0; j < eltspersegment; ++j) {
344             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
345         }
346     }
347     clear_tail(d, opr_sz, simd_maxsz(desc));
348 }
349 
350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
351                                  void *vq, uint32_t desc)
352 {
353     intptr_t i, j, opr_sz = simd_oprsz(desc);
354     int idx = simd_data(desc);
355     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
356     intptr_t elements = opr_sz / 2;
357     intptr_t eltspersegment = MIN(16 / 2, elements);
358 
359     for (i = 0; i < elements; i += 16 / 2) {
360         int16_t mm = m[i];
361         for (j = 0; j < eltspersegment; ++j) {
362             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
363         }
364     }
365     clear_tail(d, opr_sz, simd_maxsz(desc));
366 }
367 
368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
369                                  void *vq, uint32_t desc)
370 {
371     intptr_t i, j, opr_sz = simd_oprsz(desc);
372     int idx = simd_data(desc);
373     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
374     intptr_t elements = opr_sz / 2;
375     intptr_t eltspersegment = MIN(16 / 2, elements);
376 
377     for (i = 0; i < elements; i += 16 / 2) {
378         int16_t mm = m[i];
379         for (j = 0; j < eltspersegment; ++j) {
380             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
381         }
382     }
383     clear_tail(d, opr_sz, simd_maxsz(desc));
384 }
385 
386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
387                              void *va, uint32_t desc)
388 {
389     intptr_t i, opr_sz = simd_oprsz(desc);
390     int16_t *d = vd, *n = vn, *m = vm, *a = va;
391     uint32_t discard;
392 
393     for (i = 0; i < opr_sz / 2; ++i) {
394         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
395     }
396 }
397 
398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
399                              void *va, uint32_t desc)
400 {
401     intptr_t i, opr_sz = simd_oprsz(desc);
402     int16_t *d = vd, *n = vn, *m = vm, *a = va;
403     uint32_t discard;
404 
405     for (i = 0; i < opr_sz / 2; ++i) {
406         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
407     }
408 }
409 
410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
411 {
412     intptr_t i, opr_sz = simd_oprsz(desc);
413     int16_t *d = vd, *n = vn, *m = vm;
414     uint32_t discard;
415 
416     for (i = 0; i < opr_sz / 2; ++i) {
417         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
418     }
419 }
420 
421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
422 {
423     intptr_t i, opr_sz = simd_oprsz(desc);
424     int16_t *d = vd, *n = vn, *m = vm;
425     uint32_t discard;
426 
427     for (i = 0; i < opr_sz / 2; ++i) {
428         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
429     }
430 }
431 
432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
433 {
434     intptr_t i, j, opr_sz = simd_oprsz(desc);
435     int idx = simd_data(desc);
436     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
437     uint32_t discard;
438 
439     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
440         int16_t mm = m[i];
441         for (j = 0; j < 16 / 2; ++j) {
442             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
443         }
444     }
445 }
446 
447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
448 {
449     intptr_t i, j, opr_sz = simd_oprsz(desc);
450     int idx = simd_data(desc);
451     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
452     uint32_t discard;
453 
454     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
455         int16_t mm = m[i];
456         for (j = 0; j < 16 / 2; ++j) {
457             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
458         }
459     }
460 }
461 
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
464                       bool neg, bool round, uint32_t *sat)
465 {
466     /* Simplify similarly to do_sqrdmlah_b above.  */
467     int64_t ret = (int64_t)src1 * src2;
468     if (neg) {
469         ret = -ret;
470     }
471     ret += ((int64_t)src3 << 31) + (round << 30);
472     ret >>= 31;
473 
474     if (ret != (int32_t)ret) {
475         *sat = 1;
476         ret = (ret < 0 ? INT32_MIN : INT32_MAX);
477     }
478     return ret;
479 }
480 
481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
482                                   int32_t src2, int32_t src3)
483 {
484     uint32_t *sat = &env->vfp.qc[0];
485     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
486 }
487 
488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
489                               void *vq, uint32_t desc)
490 {
491     uintptr_t opr_sz = simd_oprsz(desc);
492     int32_t *d = vd;
493     int32_t *n = vn;
494     int32_t *m = vm;
495     uintptr_t i;
496 
497     for (i = 0; i < opr_sz / 4; ++i) {
498         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
499     }
500     clear_tail(d, opr_sz, simd_maxsz(desc));
501 }
502 
503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
504                                   int32_t src2, int32_t src3)
505 {
506     uint32_t *sat = &env->vfp.qc[0];
507     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
508 }
509 
510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
511                               void *vq, uint32_t desc)
512 {
513     uintptr_t opr_sz = simd_oprsz(desc);
514     int32_t *d = vd;
515     int32_t *n = vn;
516     int32_t *m = vm;
517     uintptr_t i;
518 
519     for (i = 0; i < opr_sz / 4; ++i) {
520         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
521     }
522     clear_tail(d, opr_sz, simd_maxsz(desc));
523 }
524 
525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
526                             void *vq, uint32_t desc)
527 {
528     intptr_t i, opr_sz = simd_oprsz(desc);
529     int32_t *d = vd, *n = vn, *m = vm;
530 
531     for (i = 0; i < opr_sz / 4; ++i) {
532         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
533     }
534     clear_tail(d, opr_sz, simd_maxsz(desc));
535 }
536 
537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
538                              void *vq, uint32_t desc)
539 {
540     intptr_t i, opr_sz = simd_oprsz(desc);
541     int32_t *d = vd, *n = vn, *m = vm;
542 
543     for (i = 0; i < opr_sz / 4; ++i) {
544         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
545     }
546     clear_tail(d, opr_sz, simd_maxsz(desc));
547 }
548 
549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
550                                 void *vq, uint32_t desc)
551 {
552     intptr_t i, j, opr_sz = simd_oprsz(desc);
553     int idx = simd_data(desc);
554     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
555     intptr_t elements = opr_sz / 4;
556     intptr_t eltspersegment = MIN(16 / 4, elements);
557 
558     for (i = 0; i < elements; i += 16 / 4) {
559         int32_t mm = m[i];
560         for (j = 0; j < eltspersegment; ++j) {
561             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
562         }
563     }
564     clear_tail(d, opr_sz, simd_maxsz(desc));
565 }
566 
567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
568                                  void *vq, uint32_t desc)
569 {
570     intptr_t i, j, opr_sz = simd_oprsz(desc);
571     int idx = simd_data(desc);
572     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
573     intptr_t elements = opr_sz / 4;
574     intptr_t eltspersegment = MIN(16 / 4, elements);
575 
576     for (i = 0; i < elements; i += 16 / 4) {
577         int32_t mm = m[i];
578         for (j = 0; j < eltspersegment; ++j) {
579             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
580         }
581     }
582     clear_tail(d, opr_sz, simd_maxsz(desc));
583 }
584 
585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
586                                  void *vq, uint32_t desc)
587 {
588     intptr_t i, j, opr_sz = simd_oprsz(desc);
589     int idx = simd_data(desc);
590     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
591     intptr_t elements = opr_sz / 4;
592     intptr_t eltspersegment = MIN(16 / 4, elements);
593 
594     for (i = 0; i < elements; i += 16 / 4) {
595         int32_t mm = m[i];
596         for (j = 0; j < eltspersegment; ++j) {
597             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
598         }
599     }
600     clear_tail(d, opr_sz, simd_maxsz(desc));
601 }
602 
603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
604                                  void *vq, uint32_t desc)
605 {
606     intptr_t i, j, opr_sz = simd_oprsz(desc);
607     int idx = simd_data(desc);
608     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
609     intptr_t elements = opr_sz / 4;
610     intptr_t eltspersegment = MIN(16 / 4, elements);
611 
612     for (i = 0; i < elements; i += 16 / 4) {
613         int32_t mm = m[i];
614         for (j = 0; j < eltspersegment; ++j) {
615             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
616         }
617     }
618     clear_tail(d, opr_sz, simd_maxsz(desc));
619 }
620 
621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
622                              void *va, uint32_t desc)
623 {
624     intptr_t i, opr_sz = simd_oprsz(desc);
625     int32_t *d = vd, *n = vn, *m = vm, *a = va;
626     uint32_t discard;
627 
628     for (i = 0; i < opr_sz / 4; ++i) {
629         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
630     }
631 }
632 
633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
634                              void *va, uint32_t desc)
635 {
636     intptr_t i, opr_sz = simd_oprsz(desc);
637     int32_t *d = vd, *n = vn, *m = vm, *a = va;
638     uint32_t discard;
639 
640     for (i = 0; i < opr_sz / 4; ++i) {
641         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
642     }
643 }
644 
645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
646 {
647     intptr_t i, opr_sz = simd_oprsz(desc);
648     int32_t *d = vd, *n = vn, *m = vm;
649     uint32_t discard;
650 
651     for (i = 0; i < opr_sz / 4; ++i) {
652         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
653     }
654 }
655 
656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
657 {
658     intptr_t i, opr_sz = simd_oprsz(desc);
659     int32_t *d = vd, *n = vn, *m = vm;
660     uint32_t discard;
661 
662     for (i = 0; i < opr_sz / 4; ++i) {
663         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
664     }
665 }
666 
667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
668 {
669     intptr_t i, j, opr_sz = simd_oprsz(desc);
670     int idx = simd_data(desc);
671     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
672     uint32_t discard;
673 
674     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
675         int32_t mm = m[i];
676         for (j = 0; j < 16 / 4; ++j) {
677             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
678         }
679     }
680 }
681 
682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
683 {
684     intptr_t i, j, opr_sz = simd_oprsz(desc);
685     int idx = simd_data(desc);
686     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
687     uint32_t discard;
688 
689     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
690         int32_t mm = m[i];
691         for (j = 0; j < 16 / 4; ++j) {
692             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
693         }
694     }
695 }
696 
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
698 static int64_t do_sat128_d(Int128 r)
699 {
700     int64_t ls = int128_getlo(r);
701     int64_t hs = int128_gethi(r);
702 
703     if (unlikely(hs != (ls >> 63))) {
704         return hs < 0 ? INT64_MIN : INT64_MAX;
705     }
706     return ls;
707 }
708 
709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710 {
711     uint64_t l, h;
712     Int128 r, t;
713 
714     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
715     muls64(&l, &h, m, n);
716     r = int128_make128(l, h);
717     if (neg) {
718         r = int128_neg(r);
719     }
720     if (a) {
721         t = int128_exts64(a);
722         t = int128_lshift(t, 63);
723         r = int128_add(r, t);
724     }
725     if (round) {
726         t = int128_exts64(1ll << 62);
727         r = int128_add(r, t);
728     }
729     r = int128_rshift(r, 63);
730 
731     return do_sat128_d(r);
732 }
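/*
 * The 64-bit element case cannot use the "widen by one step" trick of
 * the helpers above: the product of two int64_t values needs up to 127
 * bits (INT64_MIN * INT64_MIN is 2^126), hence the Int128 arithmetic
 * and the explicit saturation check in do_sat128_d.
 */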
733 
734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735                              void *va, uint32_t desc)
736 {
737     intptr_t i, opr_sz = simd_oprsz(desc);
738     int64_t *d = vd, *n = vn, *m = vm, *a = va;
739 
740     for (i = 0; i < opr_sz / 8; ++i) {
741         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742     }
743 }
744 
745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746                              void *va, uint32_t desc)
747 {
748     intptr_t i, opr_sz = simd_oprsz(desc);
749     int64_t *d = vd, *n = vn, *m = vm, *a = va;
750 
751     for (i = 0; i < opr_sz / 8; ++i) {
752         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753     }
754 }
755 
756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757 {
758     intptr_t i, opr_sz = simd_oprsz(desc);
759     int64_t *d = vd, *n = vn, *m = vm;
760 
761     for (i = 0; i < opr_sz / 8; ++i) {
762         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763     }
764 }
765 
766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767 {
768     intptr_t i, opr_sz = simd_oprsz(desc);
769     int64_t *d = vd, *n = vn, *m = vm;
770 
771     for (i = 0; i < opr_sz / 8; ++i) {
772         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773     }
774 }
775 
776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777 {
778     intptr_t i, j, opr_sz = simd_oprsz(desc);
779     int idx = simd_data(desc);
780     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781 
782     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783         int64_t mm = m[i];
784         for (j = 0; j < 16 / 8; ++j) {
785             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786         }
787     }
788 }
789 
790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791 {
792     intptr_t i, j, opr_sz = simd_oprsz(desc);
793     int idx = simd_data(desc);
794     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795 
796     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797         int64_t mm = m[i];
798         for (j = 0; j < 16 / 8; ++j) {
799             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800         }
801     }
802 }
803 
804 /* Integer 8- and 16-bit dot-product.

805  *
806  * Note that for the loops herein, host endianness does not matter
807  * with respect to the ordering of data within the quad-width lanes.
808  * All elements are treated equally, no matter where they are.
809  */
810 
811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
813 {                                                                         \
814     intptr_t i, opr_sz = simd_oprsz(desc);                                \
815     TYPED *d = vd, *a = va;                                               \
816     TYPEN *n = vn;                                                        \
817     TYPEM *m = vm;                                                        \
818     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
819         d[i] = (a[i] +                                                    \
820                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
821                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
822                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
823                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
824     }                                                                     \
825     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
826 }
827 
828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
833 
834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
836 {                                                                         \
837     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
838     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
839     /*                                                                    \
840      * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
841      * first iteration might not be a full 16 byte segment. But           \
842      * for vector lengths beyond that this must be SVE and we know        \
843      * opr_sz is a multiple of 16, so we need not clamp segend            \
844      * to opr_sz_n when we advance it at the end of the loop.             \
845      */                                                                   \
846     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
847     intptr_t index = simd_data(desc);                                     \
848     TYPED *d = vd, *a = va;                                               \
849     TYPEN *n = vn;                                                        \
850     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
851     do {                                                                  \
852         TYPED m0 = m_indexed[i * 4 + 0];                                  \
853         TYPED m1 = m_indexed[i * 4 + 1];                                  \
854         TYPED m2 = m_indexed[i * 4 + 2];                                  \
855         TYPED m3 = m_indexed[i * 4 + 3];                                  \
856         do {                                                              \
857             d[i] = (a[i] +                                                \
858                     n[i * 4 + 0] * m0 +                                   \
859                     n[i * 4 + 1] * m1 +                                   \
860                     n[i * 4 + 2] * m2 +                                   \
861                     n[i * 4 + 3] * m3);                                   \
862         } while (++i < segend);                                           \
863         segend = i + (16 / sizeof(TYPED));                                \
864     } while (i < opr_sz_n);                                               \
865     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
866 }
867 
868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
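/*
 * Worked example of the segmented indexing, ignoring the host-endianness
 * adjustment applied by HD(): for gvec_sdot_idx_b with a 32-byte vector
 * (opr_sz_n == 8 lanes of int32_t) and index 1, the outer loop runs
 * twice.  The first segment loads m0..m3 from bytes 4..7 of vm; by the
 * second iteration i has advanced to 4, so m0..m3 come from bytes
 * 20..23, i.e. the group at the same index within that 16-byte segment.
 */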
874 
875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
876                          float_status *fpst, uint32_t desc)
877 {
878     uintptr_t opr_sz = simd_oprsz(desc);
879     float16 *d = vd;
880     float16 *n = vn;
881     float16 *m = vm;
882     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
883     uint32_t neg_imag = neg_real ^ 1;
884     uintptr_t i;
885 
886     /* Shift boolean to the sign bit so we can xor to negate.  */
887     neg_real <<= 15;
888     neg_imag <<= 15;
889 
890     for (i = 0; i < opr_sz / 2; i += 2) {
891         float16 e0 = n[H2(i)];
892         float16 e1 = m[H2(i + 1)] ^ neg_imag;
893         float16 e2 = n[H2(i + 1)];
894         float16 e3 = m[H2(i)] ^ neg_real;
895 
896         d[H2(i)] = float16_add(e0, e1, fpst);
897         d[H2(i + 1)] = float16_add(e2, e3, fpst);
898     }
899     clear_tail(d, opr_sz, simd_maxsz(desc));
900 }
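/*
 * In terms of complex arithmetic: with neg_imag set, each pair computes
 * (n.re - m.im, n.im + m.re), i.e. the second operand rotated by 90
 * degrees before the add; with neg_real set it computes
 * (n.re + m.im, n.im - m.re), the 270 degree rotation.  The single and
 * double precision helpers below follow the same pattern.
 */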
901 
902 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
903                          float_status *fpst, uint32_t desc)
904 {
905     uintptr_t opr_sz = simd_oprsz(desc);
906     float32 *d = vd;
907     float32 *n = vn;
908     float32 *m = vm;
909     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
910     uint32_t neg_imag = neg_real ^ 1;
911     uintptr_t i;
912 
913     /* Shift boolean to the sign bit so we can xor to negate.  */
914     neg_real <<= 31;
915     neg_imag <<= 31;
916 
917     for (i = 0; i < opr_sz / 4; i += 2) {
918         float32 e0 = n[H4(i)];
919         float32 e1 = m[H4(i + 1)] ^ neg_imag;
920         float32 e2 = n[H4(i + 1)];
921         float32 e3 = m[H4(i)] ^ neg_real;
922 
923         d[H4(i)] = float32_add(e0, e1, fpst);
924         d[H4(i + 1)] = float32_add(e2, e3, fpst);
925     }
926     clear_tail(d, opr_sz, simd_maxsz(desc));
927 }
928 
929 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
930                          float_status *fpst, uint32_t desc)
931 {
932     uintptr_t opr_sz = simd_oprsz(desc);
933     float64 *d = vd;
934     float64 *n = vn;
935     float64 *m = vm;
936     uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
937     uint64_t neg_imag = neg_real ^ 1;
938     uintptr_t i;
939 
940     /* Shift boolean to the sign bit so we can xor to negate.  */
941     neg_real <<= 63;
942     neg_imag <<= 63;
943 
944     for (i = 0; i < opr_sz / 8; i += 2) {
945         float64 e0 = n[i];
946         float64 e1 = m[i + 1] ^ neg_imag;
947         float64 e2 = n[i + 1];
948         float64 e3 = m[i] ^ neg_real;
949 
950         d[i] = float64_add(e0, e1, fpst);
951         d[i + 1] = float64_add(e2, e3, fpst);
952     }
953     clear_tail(d, opr_sz, simd_maxsz(desc));
954 }
955 
956 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
957                          float_status *fpst, uint32_t desc)
958 {
959     uintptr_t opr_sz = simd_oprsz(desc);
960     float16 *d = vd, *n = vn, *m = vm, *a = va;
961     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
962     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
963     uint32_t neg_real = flip ^ neg_imag;
964     uintptr_t i;
965 
966     /* Shift boolean to the sign bit so we can xor to negate.  */
967     neg_real <<= 15;
968     neg_imag <<= 15;
969 
970     for (i = 0; i < opr_sz / 2; i += 2) {
971         float16 e2 = n[H2(i + flip)];
972         float16 e1 = m[H2(i + flip)] ^ neg_real;
973         float16 e4 = e2;
974         float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
975 
976         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
977         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
978     }
979     clear_tail(d, opr_sz, simd_maxsz(desc));
980 }
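/*
 * Combining the three controls, each pair of elements accumulates one
 * partial product of a complex multiply:
 *     flip=0 neg_imag=0 (neg_real=0):  d += ( n.re*m.re,  n.re*m.im)
 *     flip=1 neg_imag=0 (neg_real=1):  d += (-n.im*m.im,  n.im*m.re)
 *     flip=0 neg_imag=1 (neg_real=1):  d += (-n.re*m.re, -n.re*m.im)
 *     flip=1 neg_imag=1 (neg_real=0):  d += ( n.im*m.im, -n.im*m.re)
 * which correspond to the 0, 90, 180 and 270 degree rotations of FCMLA.
 */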
981 
982 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
983                              float_status *fpst, uint32_t desc)
984 {
985     uintptr_t opr_sz = simd_oprsz(desc);
986     float16 *d = vd, *n = vn, *m = vm, *a = va;
987     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
988     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
989     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
990     uint32_t neg_real = flip ^ neg_imag;
991     intptr_t elements = opr_sz / sizeof(float16);
992     intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
993     intptr_t i, j;
994 
995     /* Shift boolean to the sign bit so we can xor to negate.  */
996     neg_real <<= 15;
997     neg_imag <<= 15;
998 
999     for (i = 0; i < elements; i += eltspersegment) {
1000         float16 mr = m[H2(i + 2 * index + 0)];
1001         float16 mi = m[H2(i + 2 * index + 1)];
1002         float16 e1 = neg_real ^ (flip ? mi : mr);
1003         float16 e3 = neg_imag ^ (flip ? mr : mi);
1004 
1005         for (j = i; j < i + eltspersegment; j += 2) {
1006             float16 e2 = n[H2(j + flip)];
1007             float16 e4 = e2;
1008 
1009             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
1010             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
1011         }
1012     }
1013     clear_tail(d, opr_sz, simd_maxsz(desc));
1014 }
1015 
1016 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
1017                          float_status *fpst, uint32_t desc)
1018 {
1019     uintptr_t opr_sz = simd_oprsz(desc);
1020     float32 *d = vd, *n = vn, *m = vm, *a = va;
1021     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1022     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1023     uint32_t neg_real = flip ^ neg_imag;
1024     uintptr_t i;
1025 
1026     /* Shift boolean to the sign bit so we can xor to negate.  */
1027     neg_real <<= 31;
1028     neg_imag <<= 31;
1029 
1030     for (i = 0; i < opr_sz / 4; i += 2) {
1031         float32 e2 = n[H4(i + flip)];
1032         float32 e1 = m[H4(i + flip)] ^ neg_real;
1033         float32 e4 = e2;
1034         float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
1035 
1036         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
1037         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
1038     }
1039     clear_tail(d, opr_sz, simd_maxsz(desc));
1040 }
1041 
1042 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
1043                              float_status *fpst, uint32_t desc)
1044 {
1045     uintptr_t opr_sz = simd_oprsz(desc);
1046     float32 *d = vd, *n = vn, *m = vm, *a = va;
1047     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1048     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1049     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1050     uint32_t neg_real = flip ^ neg_imag;
1051     intptr_t elements = opr_sz / sizeof(float32);
1052     intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
1053     intptr_t i, j;
1054 
1055     /* Shift boolean to the sign bit so we can xor to negate.  */
1056     neg_real <<= 31;
1057     neg_imag <<= 31;
1058 
1059     for (i = 0; i < elements; i += eltspersegment) {
1060         float32 mr = m[H4(i + 2 * index + 0)];
1061         float32 mi = m[H4(i + 2 * index + 1)];
1062         float32 e1 = neg_real ^ (flip ? mi : mr);
1063         float32 e3 = neg_imag ^ (flip ? mr : mi);
1064 
1065         for (j = i; j < i + eltspersegment; j += 2) {
1066             float32 e2 = n[H4(j + flip)];
1067             float32 e4 = e2;
1068 
1069             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
1070             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
1071         }
1072     }
1073     clear_tail(d, opr_sz, simd_maxsz(desc));
1074 }
1075 
1076 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
1077                          float_status *fpst, uint32_t desc)
1078 {
1079     uintptr_t opr_sz = simd_oprsz(desc);
1080     float64 *d = vd, *n = vn, *m = vm, *a = va;
1081     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1082     uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1083     uint64_t neg_real = flip ^ neg_imag;
1084     uintptr_t i;
1085 
1086     /* Shift boolean to the sign bit so we can xor to negate.  */
1087     neg_real <<= 63;
1088     neg_imag <<= 63;
1089 
1090     for (i = 0; i < opr_sz / 8; i += 2) {
1091         float64 e2 = n[i + flip];
1092         float64 e1 = m[i + flip] ^ neg_real;
1093         float64 e4 = e2;
1094         float64 e3 = m[i + 1 - flip] ^ neg_imag;
1095 
1096         d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
1097         d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
1098     }
1099     clear_tail(d, opr_sz, simd_maxsz(desc));
1100 }
1101 
1102 /*
1103  * Floating point comparisons producing an integer result (all 1s or all 0s).
1104  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1105  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1106  */
1107 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1108 {
1109     return -float16_eq_quiet(op1, op2, stat);
1110 }
1111 
1112 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1113 {
1114     return -float32_eq_quiet(op1, op2, stat);
1115 }
1116 
1117 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1118 {
1119     return -float64_eq_quiet(op1, op2, stat);
1120 }
1121 
1122 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1123 {
1124     return -float16_le(op2, op1, stat);
1125 }
1126 
1127 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1128 {
1129     return -float32_le(op2, op1, stat);
1130 }
1131 
1132 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1133 {
1134     return -float64_le(op2, op1, stat);
1135 }
1136 
1137 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1138 {
1139     return -float16_lt(op2, op1, stat);
1140 }
1141 
1142 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1143 {
1144     return -float32_lt(op2, op1, stat);
1145 }
1146 
1147 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1148 {
1149     return -float64_lt(op2, op1, stat);
1150 }
1151 
1152 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1153 {
1154     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1155 }
1156 
1157 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1158 {
1159     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1160 }
1161 
1162 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1163 {
1164     return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1165 }
1166 
1167 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1168 {
1169     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1170 }
1171 
1172 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1173 {
1174     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1175 }
1176 
1177 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1178 {
1179     return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1180 }
1181 
1182 static int16_t vfp_tosszh(float16 x, float_status *fpst)
1183 {
1184     if (float16_is_any_nan(x)) {
1185         float_raise(float_flag_invalid, fpst);
1186         return 0;
1187     }
1188     return float16_to_int16_round_to_zero(x, fpst);
1189 }
1190 
1191 static uint16_t vfp_touszh(float16 x, float_status *fpst)
1192 {
1193     if (float16_is_any_nan(x)) {
1194         float_raise(float_flag_invalid, fpst);
1195         return 0;
1196     }
1197     return float16_to_uint16_round_to_zero(x, fpst);
1198 }
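/*
 * Both wrappers pin the result for a NaN input to 0 while still raising
 * InvalidOp, as the Arm FPToFixed pseudocode requires, rather than
 * returning whatever the underlying round-to-zero conversion would give.
 */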
1199 
1200 #define DO_2OP(NAME, FUNC, TYPE) \
1201 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc)  \
1202 {                                                                 \
1203     intptr_t i, oprsz = simd_oprsz(desc);                         \
1204     TYPE *d = vd, *n = vn;                                        \
1205     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1206         d[i] = FUNC(n[i], stat);                                  \
1207     }                                                             \
1208     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1209 }
1210 
1211 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1212 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1213 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1214 
1215 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1216 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1217 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1218 
1219 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1220 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1221 
1222 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1223 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1224 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1225 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1226 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1227 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1228 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1229 DO_2OP(gvec_touszh, vfp_touszh, float16)
1230 
1231 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1232     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1233     {                                                           \
1234         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1235     }
1236 
1237 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1238     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1239     {                                                           \
1240         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1241     }
1242 
1243 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1244     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1245     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1246     WRAP_CMP0_##DIRN(FN, CMPOP, float64)                \
1247     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1248     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)   \
1249     DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)
1250 
1251 DO_2OP_CMP0(cgt, cgt, FWD)
1252 DO_2OP_CMP0(cge, cge, FWD)
1253 DO_2OP_CMP0(ceq, ceq, FWD)
1254 DO_2OP_CMP0(clt, cgt, REV)
1255 DO_2OP_CMP0(cle, cge, REV)
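/*
 * For example, DO_2OP_CMP0(clt, cgt, REV) defines float16_clt0(op) as
 * float16_cgt(float16_zero, op), i.e. "0 > op", so gvec_fclt0_h writes
 * all-ones for each element that compares less than zero and all-zeros
 * otherwise.
 */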
1256 
1257 #undef DO_2OP
1258 #undef DO_2OP_CMP0
1259 
1260 /* Floating-point trigonometric starting value.
1261  * See the ARM ARM pseudocode function FPTrigSMul.
1262  */
1263 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1264 {
1265     float16 result = float16_mul(op1, op1, stat);
1266     if (!float16_is_any_nan(result)) {
1267         result = float16_set_sign(result, op2 & 1);
1268     }
1269     return result;
1270 }
1271 
1272 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1273 {
1274     float32 result = float32_mul(op1, op1, stat);
1275     if (!float32_is_any_nan(result)) {
1276         result = float32_set_sign(result, op2 & 1);
1277     }
1278     return result;
1279 }
1280 
1281 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1282 {
1283     float64 result = float64_mul(op1, op1, stat);
1284     if (!float64_is_any_nan(result)) {
1285         result = float64_set_sign(result, op2 & 1);
1286     }
1287     return result;
1288 }
1289 
1290 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1291 {
1292     return float16_abs(float16_sub(op1, op2, stat));
1293 }
1294 
1295 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1296 {
1297     return float32_abs(float32_sub(op1, op2, stat));
1298 }
1299 
1300 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1301 {
1302     return float64_abs(float64_sub(op1, op2, stat));
1303 }
1304 
1305 /*
1306  * Reciprocal step. These are the AArch32 versions, which use a
1307  * non-fused multiply-and-subtract.
1308  */
1309 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1310 {
1311     op1 = float16_squash_input_denormal(op1, stat);
1312     op2 = float16_squash_input_denormal(op2, stat);
1313 
1314     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1315         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1316         return float16_two;
1317     }
1318     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1319 }
1320 
1321 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1322 {
1323     op1 = float32_squash_input_denormal(op1, stat);
1324     op2 = float32_squash_input_denormal(op2, stat);
1325 
1326     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1327         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1328         return float32_two;
1329     }
1330     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1331 }
1332 
1333 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1334 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1335 {
1336     op1 = float16_squash_input_denormal(op1, stat);
1337     op2 = float16_squash_input_denormal(op2, stat);
1338 
1339     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1340         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1341         return float16_one_point_five;
1342     }
1343     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1344     return float16_div(op1, float16_two, stat);
1345 }
1346 
1347 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1348 {
1349     op1 = float32_squash_input_denormal(op1, stat);
1350     op2 = float32_squash_input_denormal(op2, stat);
1351 
1352     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1353         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1354         return float32_one_point_five;
1355     }
1356     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1357     return float32_div(op1, float32_two, stat);
1358 }
1359 
1360 #define DO_3OP(NAME, FUNC, TYPE) \
1361 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1362                   float_status *stat, uint32_t desc)                       \
1363 {                                                                          \
1364     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1365     TYPE *d = vd, *n = vn, *m = vm;                                        \
1366     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1367         d[i] = FUNC(n[i], m[i], stat);                                     \
1368     }                                                                      \
1369     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1370 }
1371 
1372 DO_3OP(gvec_fadd_h, float16_add, float16)
1373 DO_3OP(gvec_fadd_s, float32_add, float32)
1374 DO_3OP(gvec_fadd_d, float64_add, float64)
1375 
1376 DO_3OP(gvec_fsub_h, float16_sub, float16)
1377 DO_3OP(gvec_fsub_s, float32_sub, float32)
1378 DO_3OP(gvec_fsub_d, float64_sub, float64)
1379 
1380 DO_3OP(gvec_fmul_h, float16_mul, float16)
1381 DO_3OP(gvec_fmul_s, float32_mul, float32)
1382 DO_3OP(gvec_fmul_d, float64_mul, float64)
1383 
1384 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1385 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1386 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1387 
1388 DO_3OP(gvec_fabd_h, float16_abd, float16)
1389 DO_3OP(gvec_fabd_s, float32_abd, float32)
1390 DO_3OP(gvec_fabd_d, float64_abd, float64)
1391 
1392 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1393 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1394 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1395 
1396 DO_3OP(gvec_fcge_h, float16_cge, float16)
1397 DO_3OP(gvec_fcge_s, float32_cge, float32)
1398 DO_3OP(gvec_fcge_d, float64_cge, float64)
1399 
1400 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1401 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1402 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1403 
1404 DO_3OP(gvec_facge_h, float16_acge, float16)
1405 DO_3OP(gvec_facge_s, float32_acge, float32)
1406 DO_3OP(gvec_facge_d, float64_acge, float64)
1407 
1408 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1409 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1410 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1411 
1412 DO_3OP(gvec_fmax_h, float16_max, float16)
1413 DO_3OP(gvec_fmax_s, float32_max, float32)
1414 DO_3OP(gvec_fmax_d, float64_max, float64)
1415 
1416 DO_3OP(gvec_fmin_h, float16_min, float16)
1417 DO_3OP(gvec_fmin_s, float32_min, float32)
1418 DO_3OP(gvec_fmin_d, float64_min, float64)
1419 
1420 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1421 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1422 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1423 
1424 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1425 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1426 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1427 
1428 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1429 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1430 
1431 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1432 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1433 
1434 #ifdef TARGET_AARCH64
1435 DO_3OP(gvec_fdiv_h, float16_div, float16)
1436 DO_3OP(gvec_fdiv_s, float32_div, float32)
1437 DO_3OP(gvec_fdiv_d, float64_div, float64)
1438 
1439 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1440 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1441 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1442 
1443 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1444 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1445 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1446 
1447 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1448 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1449 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1450 
1451 #endif
1452 #undef DO_3OP
1453 
1454 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1455 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1456                                  float_status *stat)
1457 {
1458     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1459 }
1460 
1461 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1462                                  float_status *stat)
1463 {
1464     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1465 }
1466 
1467 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1468                                  float_status *stat)
1469 {
1470     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1471 }
1472 
1473 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1474                                  float_status *stat)
1475 {
1476     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1477 }
1478 
1479 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1480 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1481                                 float_status *stat)
1482 {
1483     return float16_muladd(op1, op2, dest, 0, stat);
1484 }
1485 
1486 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1487                                  float_status *stat)
1488 {
1489     return float32_muladd(op1, op2, dest, 0, stat);
1490 }
1491 
1492 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1493                                  float_status *stat)
1494 {
1495     return float64_muladd(op1, op2, dest, 0, stat);
1496 }
1497 
1498 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1499                                  float_status *stat)
1500 {
1501     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1502 }
1503 
1504 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1505                                  float_status *stat)
1506 {
1507     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1508 }
1509 
1510 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1511                                  float_status *stat)
1512 {
1513     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1514 }
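/*
 * Illustrative sketch (not part of the helpers above): the difference
 * between the _nf and _f forms is the number of rounding steps.  For
 * float32 the two shapes are, informally,
 *
 *   nf:    round(dest + round(op1 * op2))    -- two roundings
 *   fused: round(op1 * op2 + dest)           -- one rounding via muladd
 *
 * which is why only the fused forms provide the VFMA/VFMS semantics.
 */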
1515 
1516 #define DO_MULADD(NAME, FUNC, TYPE)                                        \
1517 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1518                   float_status *stat, uint32_t desc)                       \
1519 {                                                                          \
1520     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1521     TYPE *d = vd, *n = vn, *m = vm;                                        \
1522     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1523         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1524     }                                                                      \
1525     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1526 }
1527 
1528 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1529 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1530 
1531 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1532 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1533 
1534 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1535 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1536 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1537 
1538 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1539 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1540 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1541 
1542 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1543  * For AdvSIMD, there is of course only one such vector segment.
1544  */
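/*
 * Worked example (illustrative only): for gvec_mul_idx_s with
 * oprsz == 32 (two 128-bit segments) and idx == 1, the loops below
 * compute
 *
 *   d[0..3] = n[0..3] * m[1];    -- first segment uses m[1]
 *   d[4..7] = n[4..7] * m[5];    -- second segment re-reads its own m[idx]
 *
 * whereas an AdvSIMD operation only ever has the single segment.
 */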
1545 
1546 #define DO_MUL_IDX(NAME, TYPE, H) \
1547 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1548 {                                                                          \
1549     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1550     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1551     intptr_t idx = simd_data(desc);                                        \
1552     TYPE *d = vd, *n = vn, *m = vm;                                        \
1553     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1554         TYPE mm = m[H(i + idx)];                                           \
1555         for (j = 0; j < segment; j++) {                                    \
1556             d[i + j] = n[i + j] * mm;                                      \
1557         }                                                                  \
1558     }                                                                      \
1559     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1560 }
1561 
1562 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1563 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1564 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1565 
1566 #undef DO_MUL_IDX
1567 
1568 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1569 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1570 {                                                                          \
1571     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1572     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1573     intptr_t idx = simd_data(desc);                                        \
1574     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1575     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1576         TYPE mm = m[H(i + idx)];                                           \
1577         for (j = 0; j < segment; j++) {                                    \
1578             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1579         }                                                                  \
1580     }                                                                      \
1581     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1582 }
1583 
1584 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1585 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1586 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1587 
1588 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1589 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1590 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1591 
1592 #undef DO_MLA_IDX
1593 
1594 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1595 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1596                   float_status *stat, uint32_t desc)                       \
1597 {                                                                          \
1598     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1599     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1600     intptr_t idx = simd_data(desc);                                        \
1601     TYPE *d = vd, *n = vn, *m = vm;                                        \
1602     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1603         TYPE mm = m[H(i + idx)];                                           \
1604         for (j = 0; j < segment; j++) {                                    \
1605             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1606         }                                                                  \
1607     }                                                                      \
1608     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1609 }
1610 
1611 #define nop(N, M, S) (M)
1612 
1613 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1614 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1615 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1616 
1617 #ifdef TARGET_AARCH64
1618 
1619 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1620 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1621 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1622 
1623 #endif
1624 
1625 #undef nop
1626 
1627 /*
1628  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1629  * the fused ops below, these accumulate both from and into Vd.
1630  */
1631 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1632 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1633 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1634 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1635 
1636 #undef DO_FMUL_IDX
1637 
1638 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1639 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1640                   float_status *stat, uint32_t desc)                       \
1641 {                                                                          \
1642     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1643     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1644     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1645     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1646     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1647     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1648     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1649         TYPE mm = m[H(i + idx)];                                           \
1650         for (j = 0; j < segment; j++) {                                    \
1651             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1652                                      mm, a[i + j], 0, stat);               \
1653         }                                                                  \
1654     }                                                                      \
1655     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1656 }
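/*
 * Illustrative note on the desc layout decoded above: bit 0 of the data
 * field selects negation of op1 and the remaining bits hold the index.
 * For the float16 case with negation requested, op1_neg is 0x8000, so
 *
 *   n[i + j] ^ 0x8000
 *
 * flips only the sign bit, turning the fused multiply-add into the
 * multiply-subtract needed for the FMLS forms.
 */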
1657 
1658 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1659 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1660 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1661 
1662 #undef DO_FMLA_IDX
1663 
1664 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1665 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1666 {                                                                          \
1667     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1668     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1669     bool q = false;                                                        \
1670     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1671         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1672         if (dd < MIN) {                                                    \
1673             dd = MIN;                                                      \
1674             q = true;                                                      \
1675         } else if (dd > MAX) {                                             \
1676             dd = MAX;                                                      \
1677             q = true;                                                      \
1678         }                                                                  \
1679         d[i] = dd;                                                         \
1680     }                                                                      \
1681     if (q) {                                                               \
1682         uint32_t *qc = vq;                                                 \
1683         qc[0] = 1;                                                         \
1684     }                                                                      \
1685     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1686 }
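/*
 * Worked example (illustrative only): in gvec_uqadd_b, instantiated
 * below, adding 200 and 100 in the wider WTYPE (int) gives 300, which
 * exceeds UINT8_MAX, so the lane is clamped to 255 and the sticky QC
 * flag in vq[0] is set.  The widened intermediate is what makes the
 * range check trivial; the 64-bit variants further down cannot widen
 * and use explicit overflow tests instead.
 */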
1687 
1688 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1689 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1690 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1691 
1692 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1693 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1694 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1695 
1696 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1697 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1698 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1699 
1700 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1701 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1702 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1703 
1704 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1705 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1706 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1707 
1708 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1709 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1710 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1711 
1712 #undef DO_SAT
1713 
1714 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1715                           void *vm, uint32_t desc)
1716 {
1717     intptr_t i, oprsz = simd_oprsz(desc);
1718     uint64_t *d = vd, *n = vn, *m = vm;
1719     bool q = false;
1720 
1721     for (i = 0; i < oprsz / 8; i++) {
1722         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1723         if (dd < nn) {
1724             dd = UINT64_MAX;
1725             q = true;
1726         }
1727         d[i] = dd;
1728     }
1729     if (q) {
1730         uint32_t *qc = vq;
1731         qc[0] = 1;
1732     }
1733     clear_tail(d, oprsz, simd_maxsz(desc));
1734 }
1735 
1736 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1737                           void *vm, uint32_t desc)
1738 {
1739     intptr_t i, oprsz = simd_oprsz(desc);
1740     uint64_t *d = vd, *n = vn, *m = vm;
1741     bool q = false;
1742 
1743     for (i = 0; i < oprsz / 8; i++) {
1744         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1745         if (nn < mm) {
1746             dd = 0;
1747             q = true;
1748         }
1749         d[i] = dd;
1750     }
1751     if (q) {
1752         uint32_t *qc = vq;
1753         qc[0] = 1;
1754     }
1755     clear_tail(d, oprsz, simd_maxsz(desc));
1756 }
1757 
1758 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1759                           void *vm, uint32_t desc)
1760 {
1761     intptr_t i, oprsz = simd_oprsz(desc);
1762     int64_t *d = vd, *n = vn, *m = vm;
1763     bool q = false;
1764 
1765     for (i = 0; i < oprsz / 8; i++) {
1766         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1767         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1768             dd = (nn >> 63) ^ ~INT64_MIN;
1769             q = true;
1770         }
1771         d[i] = dd;
1772     }
1773     if (q) {
1774         uint32_t *qc = vq;
1775         qc[0] = 1;
1776     }
1777     clear_tail(d, oprsz, simd_maxsz(desc));
1778 }
1779 
1780 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1781                           void *vm, uint32_t desc)
1782 {
1783     intptr_t i, oprsz = simd_oprsz(desc);
1784     int64_t *d = vd, *n = vn, *m = vm;
1785     bool q = false;
1786 
1787     for (i = 0; i < oprsz / 8; i++) {
1788         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1789         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1790             dd = (nn >> 63) ^ ~INT64_MIN;
1791             q = true;
1792         }
1793         d[i] = dd;
1794     }
1795     if (q) {
1796         uint32_t *qc = vq;
1797         qc[0] = 1;
1798     }
1799     clear_tail(d, oprsz, simd_maxsz(desc));
1800 }
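/*
 * Note on the branch-free overflow tests above (illustrative): signed
 * addition overflows only when both operands have the same sign and the
 * result's sign differs, hence ((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN;
 * signed subtraction requires the operands to differ in sign, hence
 * ((dd ^ nn) & (nn ^ mm)) & INT64_MIN.  The saturated value
 * (nn >> 63) ^ ~INT64_MIN is INT64_MAX when nn >= 0 and INT64_MIN when
 * nn < 0, e.g. nn = -5 gives (-1) ^ INT64_MAX == INT64_MIN.
 */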
1801 
1802 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1803                            void *vm, uint32_t desc)
1804 {
1805     intptr_t i, oprsz = simd_oprsz(desc);
1806     uint64_t *d = vd, *n = vn, *m = vm;
1807     bool q = false;
1808 
1809     for (i = 0; i < oprsz / 8; i++) {
1810         uint64_t nn = n[i];
1811         int64_t mm = m[i];
1812         uint64_t dd = nn + mm;
1813 
1814         if (mm < 0) {
1815             if (nn < (uint64_t)-mm) {
1816                 dd = 0;
1817                 q = true;
1818             }
1819         } else {
1820             if (dd < nn) {
1821                 dd = UINT64_MAX;
1822                 q = true;
1823             }
1824         }
1825         d[i] = dd;
1826     }
1827     if (q) {
1828         uint32_t *qc = vq;
1829         qc[0] = 1;
1830     }
1831     clear_tail(d, oprsz, simd_maxsz(desc));
1832 }
1833 
1834 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1835                            void *vm, uint32_t desc)
1836 {
1837     intptr_t i, oprsz = simd_oprsz(desc);
1838     uint64_t *d = vd, *n = vn, *m = vm;
1839     bool q = false;
1840 
1841     for (i = 0; i < oprsz / 8; i++) {
1842         int64_t nn = n[i];
1843         uint64_t mm = m[i];
1844         int64_t dd = nn + mm;
1845 
1846         if (mm > (uint64_t)(INT64_MAX - nn)) {
1847             dd = INT64_MAX;
1848             q = true;
1849         }
1850         d[i] = dd;
1851     }
1852     if (q) {
1853         uint32_t *qc = vq;
1854         qc[0] = 1;
1855     }
1856     clear_tail(d, oprsz, simd_maxsz(desc));
1857 }
1858 
1859 #define DO_SRA(NAME, TYPE)                              \
1860 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1861 {                                                       \
1862     intptr_t i, oprsz = simd_oprsz(desc);               \
1863     int shift = simd_data(desc);                        \
1864     TYPE *d = vd, *n = vn;                              \
1865     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1866         d[i] += n[i] >> shift;                          \
1867     }                                                   \
1868     clear_tail(d, oprsz, simd_maxsz(desc));             \
1869 }
1870 
1871 DO_SRA(gvec_ssra_b, int8_t)
1872 DO_SRA(gvec_ssra_h, int16_t)
1873 DO_SRA(gvec_ssra_s, int32_t)
1874 DO_SRA(gvec_ssra_d, int64_t)
1875 
1876 DO_SRA(gvec_usra_b, uint8_t)
1877 DO_SRA(gvec_usra_h, uint16_t)
1878 DO_SRA(gvec_usra_s, uint32_t)
1879 DO_SRA(gvec_usra_d, uint64_t)
1880 
1881 #undef DO_SRA
1882 
1883 #define DO_RSHR(NAME, TYPE)                             \
1884 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1885 {                                                       \
1886     intptr_t i, oprsz = simd_oprsz(desc);               \
1887     int shift = simd_data(desc);                        \
1888     TYPE *d = vd, *n = vn;                              \
1889     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1890         TYPE tmp = n[i] >> (shift - 1);                 \
1891         d[i] = (tmp >> 1) + (tmp & 1);                  \
1892     }                                                   \
1893     clear_tail(d, oprsz, simd_maxsz(desc));             \
1894 }
1895 
1896 DO_RSHR(gvec_srshr_b, int8_t)
1897 DO_RSHR(gvec_srshr_h, int16_t)
1898 DO_RSHR(gvec_srshr_s, int32_t)
1899 DO_RSHR(gvec_srshr_d, int64_t)
1900 
1901 DO_RSHR(gvec_urshr_b, uint8_t)
1902 DO_RSHR(gvec_urshr_h, uint16_t)
1903 DO_RSHR(gvec_urshr_s, uint32_t)
1904 DO_RSHR(gvec_urshr_d, uint64_t)
1905 
1906 #undef DO_RSHR
1907 
1908 #define DO_RSRA(NAME, TYPE)                             \
1909 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1910 {                                                       \
1911     intptr_t i, oprsz = simd_oprsz(desc);               \
1912     int shift = simd_data(desc);                        \
1913     TYPE *d = vd, *n = vn;                              \
1914     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1915         TYPE tmp = n[i] >> (shift - 1);                 \
1916         d[i] += (tmp >> 1) + (tmp & 1);                 \
1917     }                                                   \
1918     clear_tail(d, oprsz, simd_maxsz(desc));             \
1919 }
1920 
1921 DO_RSRA(gvec_srsra_b, int8_t)
1922 DO_RSRA(gvec_srsra_h, int16_t)
1923 DO_RSRA(gvec_srsra_s, int32_t)
1924 DO_RSRA(gvec_srsra_d, int64_t)
1925 
1926 DO_RSRA(gvec_ursra_b, uint8_t)
1927 DO_RSRA(gvec_ursra_h, uint16_t)
1928 DO_RSRA(gvec_ursra_s, uint32_t)
1929 DO_RSRA(gvec_ursra_d, uint64_t)
1930 
1931 #undef DO_RSRA
1932 
1933 #define DO_SRI(NAME, TYPE)                              \
1934 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1935 {                                                       \
1936     intptr_t i, oprsz = simd_oprsz(desc);               \
1937     int shift = simd_data(desc);                        \
1938     TYPE *d = vd, *n = vn;                              \
1939     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1940         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1941     }                                                   \
1942     clear_tail(d, oprsz, simd_maxsz(desc));             \
1943 }
1944 
1945 DO_SRI(gvec_sri_b, uint8_t)
1946 DO_SRI(gvec_sri_h, uint16_t)
1947 DO_SRI(gvec_sri_s, uint32_t)
1948 DO_SRI(gvec_sri_d, uint64_t)
1949 
1950 #undef DO_SRI
1951 
1952 #define DO_SLI(NAME, TYPE)                              \
1953 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1954 {                                                       \
1955     intptr_t i, oprsz = simd_oprsz(desc);               \
1956     int shift = simd_data(desc);                        \
1957     TYPE *d = vd, *n = vn;                              \
1958     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1959         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1960     }                                                   \
1961     clear_tail(d, oprsz, simd_maxsz(desc));             \
1962 }
1963 
1964 DO_SLI(gvec_sli_b, uint8_t)
1965 DO_SLI(gvec_sli_h, uint16_t)
1966 DO_SLI(gvec_sli_s, uint32_t)
1967 DO_SLI(gvec_sli_d, uint64_t)
1968 
1969 #undef DO_SLI
1970 
1971 /*
1972  * Convert float16 to float32, raising no exceptions and
1973  * preserving exceptional values, including SNaN.
1974  * This is effectively an unpack+repack operation.
1975  */
1976 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1977 {
1978     const int f16_bias = 15;
1979     const int f32_bias = 127;
1980     uint32_t sign = extract32(f16, 15, 1);
1981     uint32_t exp = extract32(f16, 10, 5);
1982     uint32_t frac = extract32(f16, 0, 10);
1983 
1984     if (exp == 0x1f) {
1985         /* Inf or NaN */
1986         exp = 0xff;
1987     } else if (exp == 0) {
1988         /* Zero or denormal.  */
1989         if (frac != 0) {
1990             if (fz16) {
1991                 frac = 0;
1992             } else {
1993                 /*
1994                  * Denormal; these are all normal float32.
1995                  * Shift the fraction so that the msb is at bit 11,
1996                  * then remove bit 11 as the implicit bit of the
1997                  * normalized float32.  Note that we still go through
1998                  * the shift for normal numbers below, to put the
1999                  * float32 fraction at the right place.
2000                  */
2001                 int shift = clz32(frac) - 21;
2002                 frac = (frac << shift) & 0x3ff;
2003                 exp = f32_bias - f16_bias - shift + 1;
2004             }
2005         }
2006     } else {
2007         /* Normal number; adjust the bias.  */
2008         exp += f32_bias - f16_bias;
2009     }
2010     sign <<= 31;
2011     exp <<= 23;
2012     frac <<= 23 - 10;
2013 
2014     return sign | exp | frac;
2015 }
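/*
 * Worked example (illustrative only): the smallest float16 denormal,
 * 0x0001, has frac == 1, so shift == clz32(1) - 21 == 10; the shifted
 * fraction loses its implicit bit via the & 0x3ff and the exponent
 * becomes 127 - 15 - 10 + 1 == 103, giving the float32 bit pattern
 * 0x33800000, i.e. 2^-24, as expected.
 */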
2016 
2017 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2018 {
2019     /*
2020      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2021      * Load the 2nd qword iff is_q & is_2.
2022      * Shift to the 2nd dword iff !is_q & is_2.
2023      * For !is_q & !is_2, the upper bits of the result are garbage.
2024      */
2025     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2026 }
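/*
 * Worked cases for the branchless selection above (illustrative only):
 *
 *   is_q=1, is_2=1:  ptr[1] >> 0   -- second qword
 *   is_q=1, is_2=0:  ptr[0] >> 0   -- first qword
 *   is_q=0, is_2=1:  ptr[0] >> 32  -- second dword of the first qword
 *   is_q=0, is_2=0:  ptr[0] >> 0   -- first dword, upper bits unused
 */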
2027 
2028 /*
2029  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2030  * as there are not yet SVE versions that might use blocking.
2031  */
2032 
2033 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2034                      uint32_t desc, bool fz16)
2035 {
2036     intptr_t i, oprsz = simd_oprsz(desc);
2037     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2038     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2039     int is_q = oprsz == 16;
2040     uint64_t n_4, m_4;
2041 
2042     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2043     n_4 = load4_f16(vn, is_q, is_2);
2044     m_4 = load4_f16(vm, is_q, is_2);
2045 
2046     /* Negate all inputs for FMLSL at once.  */
2047     if (is_s) {
2048         n_4 ^= 0x8000800080008000ull;
2049     }
2050 
2051     for (i = 0; i < oprsz / 4; i++) {
2052         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2053         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2054         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2055     }
2056     clear_tail(d, oprsz, simd_maxsz(desc));
2057 }
2058 
2059 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2060                             void *venv, uint32_t desc)
2061 {
2062     CPUARMState *env = venv;
2063     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2064              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2065 }
2066 
2067 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2068                             void *venv, uint32_t desc)
2069 {
2070     CPUARMState *env = venv;
2071     do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
2072              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2073 }
2074 
2075 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2076                                void *venv, uint32_t desc)
2077 {
2078     intptr_t i, oprsz = simd_oprsz(desc);
2079     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2080     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2081     CPUARMState *env = venv;
2082     float_status *status = &env->vfp.fp_status;
2083     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2084 
2085     for (i = 0; i < oprsz; i += sizeof(float32)) {
2086         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2087         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2088         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2089         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2090         float32 aa = *(float32 *)(va + H1_4(i));
2091 
2092         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2093     }
2094 }
2095 
2096 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2097                          uint32_t desc, bool fz16)
2098 {
2099     intptr_t i, oprsz = simd_oprsz(desc);
2100     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2101     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2102     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2103     int is_q = oprsz == 16;
2104     uint64_t n_4;
2105     float32 m_1;
2106 
2107     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2108     n_4 = load4_f16(vn, is_q, is_2);
2109 
2110     /* Negate all inputs for FMLSL at once.  */
2111     if (is_s) {
2112         n_4 ^= 0x8000800080008000ull;
2113     }
2114 
2115     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2116 
2117     for (i = 0; i < oprsz / 4; i++) {
2118         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2119         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2120     }
2121     clear_tail(d, oprsz, simd_maxsz(desc));
2122 }
2123 
2124 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2125                                 void *venv, uint32_t desc)
2126 {
2127     CPUARMState *env = venv;
2128     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2129                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2130 }
2131 
2132 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2133                                 void *venv, uint32_t desc)
2134 {
2135     CPUARMState *env = venv;
2136     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
2137                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2138 }
2139 
2140 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2141                                void *venv, uint32_t desc)
2142 {
2143     intptr_t i, j, oprsz = simd_oprsz(desc);
2144     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2145     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2146     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2147     CPUARMState *env = venv;
2148     float_status *status = &env->vfp.fp_status;
2149     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2150 
2151     for (i = 0; i < oprsz; i += 16) {
2152         float16 mm_16 = *(float16 *)(vm + i + idx);
2153         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2154 
2155         for (j = 0; j < 16; j += sizeof(float32)) {
2156             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2157             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2158             float32 aa = *(float32 *)(va + H1_4(i + j));
2159 
2160             *(float32 *)(vd + H1_4(i + j)) =
2161                 float32_muladd(nn, mm, aa, 0, status);
2162         }
2163     }
2164 }
2165 
2166 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2167 {
2168     intptr_t i, opr_sz = simd_oprsz(desc);
2169     int8_t *d = vd, *n = vn, *m = vm;
2170 
2171     for (i = 0; i < opr_sz; ++i) {
2172         int8_t mm = m[i];
2173         int8_t nn = n[i];
2174         int8_t res = 0;
2175         if (mm >= 0) {
2176             if (mm < 8) {
2177                 res = nn << mm;
2178             }
2179         } else {
2180             res = nn >> (mm > -8 ? -mm : 7);
2181         }
2182         d[i] = res;
2183     }
2184     clear_tail(d, opr_sz, simd_maxsz(desc));
2185 }
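/*
 * Worked example (illustrative only): for gvec_sshl_b with nn == -64,
 * the signed per-lane shift count mm behaves as follows:
 *
 *   mm ==  1  ->  nn << 1 == -128
 *   mm ==  8  ->  0               (left shift count out of range)
 *   mm == -3  ->  nn >> 3 == -8   (negative counts shift right)
 *   mm == -9  ->  nn >> 7 == -1   (right shifts are clamped to 7)
 */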
2186 
2187 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2188 {
2189     intptr_t i, opr_sz = simd_oprsz(desc);
2190     int16_t *d = vd, *n = vn, *m = vm;
2191 
2192     for (i = 0; i < opr_sz / 2; ++i) {
2193         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2194         int16_t nn = n[i];
2195         int16_t res = 0;
2196         if (mm >= 0) {
2197             if (mm < 16) {
2198                 res = nn << mm;
2199             }
2200         } else {
2201             res = nn >> (mm > -16 ? -mm : 15);
2202         }
2203         d[i] = res;
2204     }
2205     clear_tail(d, opr_sz, simd_maxsz(desc));
2206 }
2207 
2208 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2209 {
2210     intptr_t i, opr_sz = simd_oprsz(desc);
2211     uint8_t *d = vd, *n = vn, *m = vm;
2212 
2213     for (i = 0; i < opr_sz; ++i) {
2214         int8_t mm = m[i];
2215         uint8_t nn = n[i];
2216         uint8_t res = 0;
2217         if (mm >= 0) {
2218             if (mm < 8) {
2219                 res = nn << mm;
2220             }
2221         } else {
2222             if (mm > -8) {
2223                 res = nn >> -mm;
2224             }
2225         }
2226         d[i] = res;
2227     }
2228     clear_tail(d, opr_sz, simd_maxsz(desc));
2229 }
2230 
2231 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2232 {
2233     intptr_t i, opr_sz = simd_oprsz(desc);
2234     uint16_t *d = vd, *n = vn, *m = vm;
2235 
2236     for (i = 0; i < opr_sz / 2; ++i) {
2237         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2238         uint16_t nn = n[i];
2239         uint16_t res = 0;
2240         if (mm >= 0) {
2241             if (mm < 16) {
2242                 res = nn << mm;
2243             }
2244         } else {
2245             if (mm > -16) {
2246                 res = nn >> -mm;
2247             }
2248         }
2249         d[i] = res;
2250     }
2251     clear_tail(d, opr_sz, simd_maxsz(desc));
2252 }
2253 
2254 /*
2255  * 8x8->8 polynomial multiply.
2256  *
2257  * Polynomial multiplication is like integer multiplication except the
2258  * partial products are XORed, not added.
2259  *
2260  * TODO: expose this as a generic vector operation, as it is a common
2261  * crypto building block.
2262  */
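/*
 * Worked example (illustrative only): 0x07 * 0x03 as polynomials is
 * (x^2 + x + 1)(x + 1) = x^3 + 1, i.e.
 *
 *   (0x07 << 1) ^ (0x07 << 0) == 0x0e ^ 0x07 == 0x09
 *
 * whereas the integer product would be 21; the partial products are
 * combined with XOR, so no carries propagate between bits.
 */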
2263 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2264 {
2265     intptr_t i, opr_sz = simd_oprsz(desc);
2266     uint64_t *d = vd, *n = vn, *m = vm;
2267 
2268     for (i = 0; i < opr_sz / 8; ++i) {
2269         d[i] = clmul_8x8_low(n[i], m[i]);
2270     }
2271     clear_tail(d, opr_sz, simd_maxsz(desc));
2272 }
2273 
2274 /*
2275  * 64x64->128 polynomial multiply.
2276  * Because the lanes are not accessed in strict columns,
2277  * this probably cannot be turned into a generic helper.
2278  */
2279 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2280 {
2281     intptr_t i, opr_sz = simd_oprsz(desc);
2282     intptr_t hi = simd_data(desc);
2283     uint64_t *d = vd, *n = vn, *m = vm;
2284 
2285     for (i = 0; i < opr_sz / 8; i += 2) {
2286         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2287         d[i] = int128_getlo(r);
2288         d[i + 1] = int128_gethi(r);
2289     }
2290     clear_tail(d, opr_sz, simd_maxsz(desc));
2291 }
2292 
2293 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2294 {
2295     int hi = simd_data(desc);
2296     uint64_t *d = vd, *n = vn, *m = vm;
2297     uint64_t nn = n[hi], mm = m[hi];
2298 
2299     d[0] = clmul_8x4_packed(nn, mm);
2300     nn >>= 32;
2301     mm >>= 32;
2302     d[1] = clmul_8x4_packed(nn, mm);
2303 
2304     clear_tail(d, 16, simd_maxsz(desc));
2305 }
2306 
2307 #ifdef TARGET_AARCH64
2308 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2309 {
2310     int shift = simd_data(desc) * 8;
2311     intptr_t i, opr_sz = simd_oprsz(desc);
2312     uint64_t *d = vd, *n = vn, *m = vm;
2313 
2314     for (i = 0; i < opr_sz / 8; ++i) {
2315         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2316     }
2317 }
2318 
2319 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2320 {
2321     intptr_t sel = H4(simd_data(desc));
2322     intptr_t i, opr_sz = simd_oprsz(desc);
2323     uint32_t *n = vn, *m = vm;
2324     uint64_t *d = vd;
2325 
2326     for (i = 0; i < opr_sz / 8; ++i) {
2327         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2328     }
2329 }
2330 #endif
2331 
2332 #define DO_CMP0(NAME, TYPE, OP)                         \
2333 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2334 {                                                       \
2335     intptr_t i, opr_sz = simd_oprsz(desc);              \
2336     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2337         TYPE nn = *(TYPE *)(vn + i);                    \
2338         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2339     }                                                   \
2340     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2341 }
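/*
 * Illustrative note: the -(nn OP 0) form above relies on the comparison
 * evaluating to 0 or 1, so the negation yields the all-zeros or
 * all-ones lane mask the instructions require, e.g. for int8_t:
 *
 *   nn = 5, OP is '>'  ->  -(1) == 0xff
 *   nn = 0, OP is '>'  ->  -(0) == 0x00
 */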
2342 
2343 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2344 DO_CMP0(gvec_clt0_b, int8_t, <)
2345 DO_CMP0(gvec_cle0_b, int8_t, <=)
2346 DO_CMP0(gvec_cgt0_b, int8_t, >)
2347 DO_CMP0(gvec_cge0_b, int8_t, >=)
2348 
2349 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2350 DO_CMP0(gvec_clt0_h, int16_t, <)
2351 DO_CMP0(gvec_cle0_h, int16_t, <=)
2352 DO_CMP0(gvec_cgt0_h, int16_t, >)
2353 DO_CMP0(gvec_cge0_h, int16_t, >=)
2354 
2355 #undef DO_CMP0
2356 
2357 #define DO_ABD(NAME, TYPE)                                      \
2358 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2359 {                                                               \
2360     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2361     TYPE *d = vd, *n = vn, *m = vm;                             \
2362                                                                 \
2363     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2364         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2365     }                                                           \
2366     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2367 }
2368 
2369 DO_ABD(gvec_sabd_b, int8_t)
2370 DO_ABD(gvec_sabd_h, int16_t)
2371 DO_ABD(gvec_sabd_s, int32_t)
2372 DO_ABD(gvec_sabd_d, int64_t)
2373 
2374 DO_ABD(gvec_uabd_b, uint8_t)
2375 DO_ABD(gvec_uabd_h, uint16_t)
2376 DO_ABD(gvec_uabd_s, uint32_t)
2377 DO_ABD(gvec_uabd_d, uint64_t)
2378 
2379 #undef DO_ABD
2380 
2381 #define DO_ABA(NAME, TYPE)                                      \
2382 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2383 {                                                               \
2384     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2385     TYPE *d = vd, *n = vn, *m = vm;                             \
2386                                                                 \
2387     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2388         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2389     }                                                           \
2390     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2391 }
2392 
2393 DO_ABA(gvec_saba_b, int8_t)
2394 DO_ABA(gvec_saba_h, int16_t)
2395 DO_ABA(gvec_saba_s, int32_t)
2396 DO_ABA(gvec_saba_d, int64_t)
2397 
2398 DO_ABA(gvec_uaba_b, uint8_t)
2399 DO_ABA(gvec_uaba_h, uint16_t)
2400 DO_ABA(gvec_uaba_s, uint32_t)
2401 DO_ABA(gvec_uaba_d, uint64_t)
2402 
2403 #undef DO_ABA
2404 
2405 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2406 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
2407                   float_status *stat, uint32_t desc)                       \
2408 {                                                                          \
2409     ARMVectorReg scratch;                                                  \
2410     intptr_t oprsz = simd_oprsz(desc);                                     \
2411     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2412     TYPE *d = vd, *n = vn, *m = vm;                                        \
2413     if (unlikely(d == m)) {                                                \
2414         m = memcpy(&scratch, m, oprsz);                                    \
2415     }                                                                      \
2416     for (intptr_t i = 0; i < half; ++i) {                                  \
2417         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2418     }                                                                      \
2419     for (intptr_t i = 0; i < half; ++i) {                                  \
2420         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2421     }                                                                      \
2422     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2423 }
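/*
 * Worked example (illustrative only): for gvec_faddp_s with oprsz == 16,
 * half == 2, so the destination is laid out as
 *
 *   d = { n[0]+n[1], n[2]+n[3], m[0]+m[1], m[2]+m[3] }
 *
 * i.e. the pairwise sums of n fill the low half and those of m the
 * high half, which is why m is copied to a scratch buffer when it
 * aliases d.
 */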
2424 
2425 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2426 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2427 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2428 
2429 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2430 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2431 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2432 
2433 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2434 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2435 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2436 
2437 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2438 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2439 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2440 
2441 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2442 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2443 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2444 
2445 #undef DO_3OP_PAIR
2446 
2447 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2448 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2449 {                                                               \
2450     ARMVectorReg scratch;                                       \
2451     intptr_t oprsz = simd_oprsz(desc);                          \
2452     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2453     TYPE *d = vd, *n = vn, *m = vm;                             \
2454     if (unlikely(d == m)) {                                     \
2455         m = memcpy(&scratch, m, oprsz);                         \
2456     }                                                           \
2457     for (intptr_t i = 0; i < half; ++i) {                       \
2458         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2459     }                                                           \
2460     for (intptr_t i = 0; i < half; ++i) {                       \
2461         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2462     }                                                           \
2463     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2464 }
2465 
2466 #define ADD(A, B) (A + B)
2467 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2468 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2469 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2470 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2471 #undef  ADD
2472 
2473 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2474 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2475 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2476 
2477 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2478 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2479 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2480 
2481 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2482 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2483 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2484 
2485 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2486 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2487 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2488 
2489 #undef DO_3OP_PAIR
2490 
2491 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2492     void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
2493     {                                                                   \
2494         intptr_t i, oprsz = simd_oprsz(desc);                           \
2495         int shift = simd_data(desc);                                    \
2496         TYPE *d = vd, *n = vn;                                          \
2497         float_status *fpst = stat;                                      \
2498         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2499             d[i] = FUNC(n[i], shift, fpst);                             \
2500         }                                                               \
2501         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2502     }
2503 
2504 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2505 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2506 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2507 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2508 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2509 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2510 
2511 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2512 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2513 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2514 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2515 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2516 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2517 
2518 #undef DO_VCVT_FIXED
2519 
2520 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2521     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2522     {                                                                   \
2523         intptr_t i, oprsz = simd_oprsz(desc);                           \
2524         uint32_t rmode = simd_data(desc);                               \
2525         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2526         TYPE *d = vd, *n = vn;                                          \
2527         set_float_rounding_mode(rmode, fpst);                           \
2528         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2529             d[i] = FUNC(n[i], 0, fpst);                                 \
2530         }                                                               \
2531         set_float_rounding_mode(prev_rmode, fpst);                      \
2532         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2533     }
2534 
2535 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
2536 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
2537 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2538 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2539 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2540 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2541 
2542 #undef DO_VCVT_RMODE
2543 
2544 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2545     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2546     {                                                                   \
2547         intptr_t i, oprsz = simd_oprsz(desc);                           \
2548         uint32_t rmode = simd_data(desc);                               \
2549         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2550         TYPE *d = vd, *n = vn;                                          \
2551         set_float_rounding_mode(rmode, fpst);                           \
2552         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2553             d[i] = FUNC(n[i], fpst);                                    \
2554         }                                                               \
2555         set_float_rounding_mode(prev_rmode, fpst);                      \
2556         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2557     }
2558 
2559 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2560 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2561 
2562 #undef DO_VRINT_RMODE
2563 
2564 #ifdef TARGET_AARCH64
2565 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2566 {
2567     const uint8_t *indices = vm;
2568     CPUARMState *env = venv;
2569     size_t oprsz = simd_oprsz(desc);
2570     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2571     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2572     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2573     union {
2574         uint8_t b[16];
2575         uint64_t d[2];
2576     } result;
2577 
2578     /*
2579      * We must construct the final result in a temp, lest the output
2580      * overlap the input table.  For TBL, begin with zero; for TBX,
2581      * begin with the original register contents.  Note that we always
2582      * copy 16 bytes here to avoid an extra branch; clearing the high
2583      * bits of the register for oprsz == 8 is handled below.
2584      */
2585     if (is_tbx) {
2586         memcpy(&result, vd, 16);
2587     } else {
2588         memset(&result, 0, 16);
2589     }
2590 
2591     for (size_t i = 0; i < oprsz; ++i) {
2592         uint32_t index = indices[H1(i)];
2593 
2594         if (index < table_len) {
2595             /*
2596              * Convert index (a byte offset into the virtual table
2597              * which is a series of 128-bit vectors concatenated)
2598              * into the correct register element, bearing in mind
2599              * that the table can wrap around from V31 to V0.
2600              */
2601             const uint8_t *table = (const uint8_t *)
2602                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2603             result.b[H1(i)] = table[H1(index % 16)];
2604         }
2605     }
2606 
2607     memcpy(vd, &result, 16);
2608     clear_tail(vd, oprsz, simd_maxsz(desc));
2609 }
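/*
 * Worked example (illustrative only): a two-register TBL has
 * table_len == 32.  An index byte of 17 selects byte 1 of the register
 * after Vn (17 >> 4 == 1, 17 % 16 == 1), while an index of 40 is out
 * of range and yields 0 for TBL or leaves the destination byte
 * unchanged for TBX.
 */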
2610 #endif
2611 
2612 /*
2613  * NxN -> N highpart multiply
2614  *
2615  * TODO: expose this as a generic vector operation.
2616  */
2617 
2618 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2619 {
2620     intptr_t i, opr_sz = simd_oprsz(desc);
2621     int8_t *d = vd, *n = vn, *m = vm;
2622 
2623     for (i = 0; i < opr_sz; ++i) {
2624         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2625     }
2626     clear_tail(d, opr_sz, simd_maxsz(desc));
2627 }
2628 
2629 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2630 {
2631     intptr_t i, opr_sz = simd_oprsz(desc);
2632     int16_t *d = vd, *n = vn, *m = vm;
2633 
2634     for (i = 0; i < opr_sz / 2; ++i) {
2635         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2636     }
2637     clear_tail(d, opr_sz, simd_maxsz(desc));
2638 }
2639 
2640 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2641 {
2642     intptr_t i, opr_sz = simd_oprsz(desc);
2643     int32_t *d = vd, *n = vn, *m = vm;
2644 
2645     for (i = 0; i < opr_sz / 4; ++i) {
2646         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2647     }
2648     clear_tail(d, opr_sz, simd_maxsz(desc));
2649 }
2650 
2651 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2652 {
2653     intptr_t i, opr_sz = simd_oprsz(desc);
2654     uint64_t *d = vd, *n = vn, *m = vm;
2655     uint64_t discard;
2656 
2657     for (i = 0; i < opr_sz / 8; ++i) {
2658         muls64(&discard, &d[i], n[i], m[i]);
2659     }
2660     clear_tail(d, opr_sz, simd_maxsz(desc));
2661 }
2662 
2663 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2664 {
2665     intptr_t i, opr_sz = simd_oprsz(desc);
2666     uint8_t *d = vd, *n = vn, *m = vm;
2667 
2668     for (i = 0; i < opr_sz; ++i) {
2669         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2670     }
2671     clear_tail(d, opr_sz, simd_maxsz(desc));
2672 }
2673 
2674 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2675 {
2676     intptr_t i, opr_sz = simd_oprsz(desc);
2677     uint16_t *d = vd, *n = vn, *m = vm;
2678 
2679     for (i = 0; i < opr_sz / 2; ++i) {
2680         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2681     }
2682     clear_tail(d, opr_sz, simd_maxsz(desc));
2683 }
2684 
2685 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2686 {
2687     intptr_t i, opr_sz = simd_oprsz(desc);
2688     uint32_t *d = vd, *n = vn, *m = vm;
2689 
2690     for (i = 0; i < opr_sz / 4; ++i) {
2691         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2692     }
2693     clear_tail(d, opr_sz, simd_maxsz(desc));
2694 }
2695 
2696 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2697 {
2698     intptr_t i, opr_sz = simd_oprsz(desc);
2699     uint64_t *d = vd, *n = vn, *m = vm;
2700     uint64_t discard;
2701 
2702     for (i = 0; i < opr_sz / 8; ++i) {
2703         mulu64(&discard, &d[i], n[i], m[i]);
2704     }
2705     clear_tail(d, opr_sz, simd_maxsz(desc));
2706 }
2707 
2708 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2709 {
2710     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2711     int shr = simd_data(desc);
2712     uint64_t *d = vd, *n = vn, *m = vm;
2713 
2714     for (i = 0; i < opr_sz; ++i) {
2715         d[i] = ror64(n[i] ^ m[i], shr);
2716     }
2717     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2718 }
2719 
2720 /*
2721  * Integer matrix-multiply accumulate
2722  */
2723 
2724 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2725 {
2726     int8_t *n = vn, *m = vm;
2727 
2728     for (intptr_t k = 0; k < 8; ++k) {
2729         sum += n[H1(k)] * m[H1(k)];
2730     }
2731     return sum;
2732 }
2733 
2734 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2735 {
2736     uint8_t *n = vn, *m = vm;
2737 
2738     for (intptr_t k = 0; k < 8; ++k) {
2739         sum += n[H1(k)] * m[H1(k)];
2740     }
2741     return sum;
2742 }
2743 
2744 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2745 {
2746     uint8_t *n = vn;
2747     int8_t *m = vm;
2748 
2749     for (intptr_t k = 0; k < 8; ++k) {
2750         sum += n[H1(k)] * m[H1(k)];
2751     }
2752     return sum;
2753 }
2754 
2755 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2756                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2757 {
2758     intptr_t seg, opr_sz = simd_oprsz(desc);
2759 
2760     for (seg = 0; seg < opr_sz; seg += 16) {
2761         uint32_t *d = vd + seg;
2762         uint32_t *a = va + seg;
2763         uint32_t sum0, sum1, sum2, sum3;
2764 
2765         /*
2766          * Process the entire segment at once, writing back the
2767          * results only after we've consumed all of the inputs.
2768          *
2769          * Key to the indices below: element 2*i + j of a and d pairs
2770          * row i of n (byte offset 8 * i) with column j of m (offset 8 * j).
2771          */
2772         sum0 = a[H4(0 + 0)];
2773         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2774         sum1 = a[H4(0 + 1)];
2775         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2776         sum2 = a[H4(2 + 0)];
2777         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2778         sum3 = a[H4(2 + 1)];
2779         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2780 
2781         d[H4(0)] = sum0;
2782         d[H4(1)] = sum1;
2783         d[H4(2)] = sum2;
2784         d[H4(3)] = sum3;
2785     }
2786     clear_tail(vd, opr_sz, simd_maxsz(desc));
2787 }
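/*
 * Illustrative note: within each 16-byte segment, n supplies a 2x8
 * matrix as two rows of 8 bytes and m supplies an 8x2 matrix as two
 * columns of 8 bytes, so the four sums above form the 2x2 accumulation
 *
 *   d[2*i + j] = a[2*i + j] + dot(row i of n, column j of m)
 *
 * with the dot product provided by the inner_loop callback in the
 * signedness required by SMMLA, UMMLA or USMMLA.
 */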
2788 
2789 #define DO_MMLA_B(NAME, INNER) \
2790     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2791     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2792 
2793 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2794 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2795 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2796 
2797 /*
2798  * BFloat16 Dot Product
2799  */
2800 
2801 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2802 {
2803     /*
2804      * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2805      * For EBF = 0, we ignore the FPCR bits which determine rounding
2806      * mode and denormal-flushing, and we do unfused multiplies and
2807      * additions with intermediate rounding of all products and sums.
2808      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2809      * and we perform a fused two-way sum-of-products without intermediate
2810      * rounding of the products.
2811      * In either case, we don't set fp exception flags.
2812      *
2813      * EBF is AArch64 only, so even if it's set in the FPCR it has
2814      * no effect on AArch32 instructions.
2815      */
2816     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2817 
2818     *statusp = env->vfp.fp_status;
2819     set_default_nan_mode(true, statusp);
2820 
2821     if (ebf) {
2822         /* EBF=1 needs to do a step with round-to-odd semantics */
2823         *oddstatusp = *statusp;
2824         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2825     } else {
2826         set_flush_to_zero(true, statusp);
2827         set_flush_inputs_to_zero(true, statusp);
2828         set_float_rounding_mode(float_round_to_odd_inf, statusp);
2829     }
2830     return ebf;
2831 }
2832 
2833 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2834 {
2835     float32 t1, t2;
2836 
2837     /*
2838      * Extract each BFloat16 from the element pair, and shift
2839      * them such that they become float32.
2840      */
2841     t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2842     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2843     t1 = float32_add(t1, t2, fpst);
2844     t1 = float32_add(sum, t1, fpst);
2845 
2846     return t1;
2847 }
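/*
 * Worked example (illustrative only): a bfloat16 value is the high half
 * of the corresponding float32 bit pattern, so if the low half of e1 is
 * 0x3f80 (bfloat16 1.0) then e1 << 16 == 0x3f800000, which is exactly
 * float32 1.0; the other element of the pair already sits in the high
 * half and only needs the low 16 bits masked off.
 */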
2848 
2849 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
2850                      float_status *fpst, float_status *fpst_odd)
2851 {
2852     /*
2853      * Compare f16_dotadd() in sme_helper.c, but here we have
2854      * bfloat16 inputs. In particular that means that we do not
2855      * want the FPCR.FZ16 flush semantics, so we use the normal
2856      * float_status for the input handling here.
2857      */
2858     float64 e1r = float32_to_float64(e1 << 16, fpst);
2859     float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
2860     float64 e2r = float32_to_float64(e2 << 16, fpst);
2861     float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
2862     float64 t64;
2863     float32 t32;
2864 
2865     /*
2866      * The ARM pseudocode function FPDot performs both multiplies
2867      * and the add with a single rounding operation.  Emulate this
2868      * by performing the first multiply in round-to-odd, then doing
2869      * the second multiply as fused multiply-add, and rounding to
2870      * float32 all in one step.
2871      */
2872     t64 = float64_mul(e1r, e2r, fpst_odd);
2873     t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
2874 
2875     /* This conversion is exact, because we've already rounded. */
2876     t32 = float64_to_float32(t64, fpst);
2877 
2878     /* The final accumulation step is not fused. */
2879     return float32_add(sum, t32, fpst);
2880 }
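
/*
 * Editor's note: round-to-odd on the first product is what makes the
 * single-rounding emulation above work.  Rounding the intermediate
 * float64 to odd preserves enough information in the low mantissa bit
 * that the later rounding to float32 precision (inside
 * float64r32_muladd) cannot suffer a double-rounding error; float64
 * is comfortably wide enough for this to hold for float32 results.
 */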
2881 
2882 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
2883                         CPUARMState *env, uint32_t desc)
2884 {
2885     intptr_t i, opr_sz = simd_oprsz(desc);
2886     float32 *d = vd, *a = va;
2887     uint32_t *n = vn, *m = vm;
2888     float_status fpst, fpst_odd;
2889 
2890     if (is_ebf(env, &fpst, &fpst_odd)) {
2891         for (i = 0; i < opr_sz / 4; ++i) {
2892             d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
2893         }
2894     } else {
2895         for (i = 0; i < opr_sz / 4; ++i) {
2896             d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
2897         }
2898     }
2899     clear_tail(d, opr_sz, simd_maxsz(desc));
2900 }
2901 
2902 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2903                             void *va, CPUARMState *env, uint32_t desc)
2904 {
2905     intptr_t i, j, opr_sz = simd_oprsz(desc);
2906     intptr_t index = simd_data(desc);
2907     intptr_t elements = opr_sz / 4;
2908     intptr_t eltspersegment = MIN(16 / 4, elements);
2909     float32 *d = vd, *a = va;
2910     uint32_t *n = vn, *m = vm;
2911     float_status fpst, fpst_odd;
2912 
2913     if (is_ebf(env, &fpst, &fpst_odd)) {
2914         for (i = 0; i < elements; i += eltspersegment) {
2915             uint32_t m_idx = m[i + H4(index)];
2916 
2917             for (j = i; j < i + eltspersegment; j++) {
2918                 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
2919             }
2920         }
2921     } else {
2922         for (i = 0; i < elements; i += eltspersegment) {
2923             uint32_t m_idx = m[i + H4(index)];
2924 
2925             for (j = i; j < i + eltspersegment; j++) {
2926                 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
2927             }
2928         }
2929     }
2930     clear_tail(d, opr_sz, simd_maxsz(desc));
2931 }
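
/*
 * Editor's note: the indexed form works on 128-bit segments of four
 * float32 accumulators.  Within each segment the same bfloat16 pair,
 * selected by "index" from that segment of m, is reused for all four
 * dot products; only n and the accumulators advance element by
 * element.
 */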
2932 
2933 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
2934                          CPUARMState *env, uint32_t desc)
2935 {
2936     intptr_t s, opr_sz = simd_oprsz(desc);
2937     float32 *d = vd, *a = va;
2938     uint32_t *n = vn, *m = vm;
2939     float_status fpst, fpst_odd;
2940 
2941     if (is_ebf(env, &fpst, &fpst_odd)) {
2942         for (s = 0; s < opr_sz / 4; s += 4) {
2943             float32 sum00, sum01, sum10, sum11;
2944 
2945             /*
2946              * Process the entire segment at once, writing back the
2947              * results only after we've consumed all of the inputs.
2948              *
2949              * Key to indices by column:
2950              *               i   j               i   k             j   k
2951              */
2952             sum00 = a[s + H4(0 + 0)];
2953             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2954             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2955 
2956             sum01 = a[s + H4(0 + 1)];
2957             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2958             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2959 
2960             sum10 = a[s + H4(2 + 0)];
2961             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2962             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2963 
2964             sum11 = a[s + H4(2 + 1)];
2965             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2966             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2967 
2968             d[s + H4(0 + 0)] = sum00;
2969             d[s + H4(0 + 1)] = sum01;
2970             d[s + H4(2 + 0)] = sum10;
2971             d[s + H4(2 + 1)] = sum11;
2972         }
2973     } else {
2974         for (s = 0; s < opr_sz / 4; s += 4) {
2975             float32 sum00, sum01, sum10, sum11;
2976 
2977             /*
2978              * Process the entire segment at once, writing back the
2979              * results only after we've consumed all of the inputs.
2980              *
2981              * Key to indices by column:
2982              *               i   j           i   k             j   k
2983              */
2984             sum00 = a[s + H4(0 + 0)];
2985             sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
2986             sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
2987 
2988             sum01 = a[s + H4(0 + 1)];
2989             sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
2990             sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
2991 
2992             sum10 = a[s + H4(2 + 0)];
2993             sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
2994             sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
2995 
2996             sum11 = a[s + H4(2 + 1)];
2997             sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
2998             sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
2999 
3000             d[s + H4(0 + 0)] = sum00;
3001             d[s + H4(0 + 1)] = sum01;
3002             d[s + H4(2 + 0)] = sum10;
3003             d[s + H4(2 + 1)] = sum11;
3004         }
3005     }
3006     clear_tail(d, opr_sz, simd_maxsz(desc));
3007 }
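
/*
 * Editor's note: structurally this is the same 2x2 tile computation
 * as do_mmla_b above, with the 8-way byte dot product replaced by a
 * 4-way bfloat16 dot product: each of the four float32 results adds
 * two bfloat16 pairs from a row of n times a row of m onto the
 * corresponding accumulator element.
 */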
3008 
3009 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
3010                          float_status *stat, uint32_t desc)
3011 {
3012     intptr_t i, opr_sz = simd_oprsz(desc);
3013     intptr_t sel = simd_data(desc);
3014     float32 *d = vd, *a = va;
3015     bfloat16 *n = vn, *m = vm;
3016 
3017     for (i = 0; i < opr_sz / 4; ++i) {
3018         float32 nn = n[H2(i * 2 + sel)] << 16;
3019         float32 mm = m[H2(i * 2 + sel)] << 16;
3020         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
3021     }
3022     clear_tail(d, opr_sz, simd_maxsz(desc));
3023 }
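
/*
 * Editor's note: "sel" (from simd_data) selects which half of each
 * bfloat16 pair feeds the widening multiply-accumulate: 0 takes the
 * even-numbered (bottom) elements and 1 the odd-numbered (top) ones,
 * matching the BFMLALB/BFMLALT forms.  Each selected bfloat16 is
 * widened to float32 by the << 16 shift and combined with the
 * accumulator in a single fused multiply-add.
 */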
3024 
3025 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
3026                              void *va, float_status *stat, uint32_t desc)
3027 {
3028     intptr_t i, j, opr_sz = simd_oprsz(desc);
3029     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3030     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3031     intptr_t elements = opr_sz / 4;
3032     intptr_t eltspersegment = MIN(16 / 4, elements);
3033     float32 *d = vd, *a = va;
3034     bfloat16 *n = vn, *m = vm;
3035 
3036     for (i = 0; i < elements; i += eltspersegment) {
3037         float32 m_idx = m[H2(2 * i + index)] << 16;
3038 
3039         for (j = i; j < i + eltspersegment; j++) {
3040             float32 n_j = n[H2(2 * j + sel)] << 16;
3041             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
3042         }
3043     }
3044     clear_tail(d, opr_sz, simd_maxsz(desc));
3045 }
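
/*
 * Editor's note: the indexed form packs two fields into the simd
 * data: bit 0 is "sel" (bottom/top element of each pair, as above)
 * and bits [3:1] are the element index, which picks one bfloat16 from
 * m per 128-bit segment and reuses it for all four float32 lanes of
 * that segment.
 */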
3046 
3047 #define DO_CLAMP(NAME, TYPE) \
3048 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
3049 {                                                                       \
3050     intptr_t i, opr_sz = simd_oprsz(desc);                              \
3051     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
3052         TYPE aa = *(TYPE *)(a + i);                                     \
3053         TYPE nn = *(TYPE *)(n + i);                                     \
3054         TYPE mm = *(TYPE *)(m + i);                                     \
3055         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
3056         *(TYPE *)(d + i) = dd;                                          \
3057     }                                                                   \
3058     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
3059 }
3060 
3061 DO_CLAMP(gvec_sclamp_b, int8_t)
3062 DO_CLAMP(gvec_sclamp_h, int16_t)
3063 DO_CLAMP(gvec_sclamp_s, int32_t)
3064 DO_CLAMP(gvec_sclamp_d, int64_t)
3065 
3066 DO_CLAMP(gvec_uclamp_b, uint8_t)
3067 DO_CLAMP(gvec_uclamp_h, uint16_t)
3068 DO_CLAMP(gvec_uclamp_s, uint32_t)
3069 DO_CLAMP(gvec_uclamp_d, uint64_t)
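
/*
 * Editor's note: a scalar reference for the signed 32-bit case above,
 * illustrative only.  The value from "a" is clamped into [n, m]; MIN
 * is applied last, so m wins if the bounds cross.  For example,
 * sclamp_s32_ref(100, -5, 20) == 20 and sclamp_s32_ref(-9, -5, 20) == -5.
 */
static inline int32_t sclamp_s32_ref(int32_t a, int32_t n, int32_t m)
{
    return MIN(MAX(a, n), m);
}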
3070 
3071 /* Bit count in each 8-bit word. */
3072 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
3073 {
3074     intptr_t i, opr_sz = simd_oprsz(desc);
3075     uint8_t *d = vd, *n = vn;
3076 
3077     for (i = 0; i < opr_sz; ++i) {
3078         d[i] = ctpop8(n[i]);
3079     }
3080     clear_tail(d, opr_sz, simd_maxsz(desc));
3081 }
3082 
3083 /* Reverse bits in each 8-bit word. */
3084 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
3085 {
3086     intptr_t i, opr_sz = simd_oprsz(desc);
3087     uint64_t *d = vd, *n = vn;
3088 
3089     for (i = 0; i < opr_sz / 8; ++i) {
3090         d[i] = revbit64(bswap64(n[i]));
3091     }
3092     clear_tail(d, opr_sz, simd_maxsz(desc));
3093 }
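
/*
 * Editor's note: revbit64(bswap64(x)) is a per-byte bit reversal
 * because revbit64() reverses both the bit order within each byte and
 * the order of the bytes, while the preceding bswap64() pre-reverses
 * the byte order so that the two byte reorderings cancel.  Worked
 * example for one byte: x = 0x01 -> bswap64 = 0x0100000000000000 ->
 * revbit64 = 0x0000000000000080, i.e. byte 0x01 becomes 0x80.  A
 * byte-at-a-time reference (illustrative only):
 */
static inline uint8_t rbit8_ref(uint8_t x)
{
    x = (x & 0xf0) >> 4 | (x & 0x0f) << 4;   /* swap nibbles */
    x = (x & 0xcc) >> 2 | (x & 0x33) << 2;   /* swap bit pairs */
    x = (x & 0xaa) >> 1 | (x & 0x55) << 1;   /* swap adjacent bits */
    return x;
}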
3094 
3095 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
3096 {
3097     intptr_t i, opr_sz = simd_oprsz(desc);
3098     uint32_t *d = vd, *n = vn;
3099 
3100     for (i = 0; i < opr_sz / 4; ++i) {
3101         d[i] = helper_recpe_u32(n[i]);
3102     }
3103     clear_tail(d, opr_sz, simd_maxsz(desc));
3104 }
3105 
3106 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
3107 {
3108     intptr_t i, opr_sz = simd_oprsz(desc);
3109     uint32_t *d = vd, *n = vn;
3110 
3111     for (i = 0; i < opr_sz / 4; ++i) {
3112         d[i] = helper_rsqrte_u32(n[i]);
3113     }
3114     clear_tail(d, opr_sz, simd_maxsz(desc));
3115 }
3116