xref: /qemu/target/arm/tcg/vec_helper.c (revision 9a93223c86c6721a9c868085dae28698852bb8d2)
1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28 
29 /*
30  * Data for expanding active predicate bits to bytes, for byte elements.
31  *
32  *  for (i = 0; i < 256; ++i) {
33  *      unsigned long m = 0;
34  *      for (j = 0; j < 8; j++) {
35  *          if ((i >> j) & 1) {
36  *              m |= 0xfful << (j << 3);
37  *          }
38  *      }
39  *      printf("0x%016lx,\n", m);
40  *  }
41  */
42 const uint64_t expand_pred_b_data[256] = {
43     0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44     0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45     0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46     0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47     0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48     0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49     0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50     0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51     0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52     0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53     0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54     0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55     0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56     0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57     0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58     0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59     0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60     0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61     0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62     0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63     0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64     0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65     0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66     0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67     0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68     0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69     0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70     0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71     0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72     0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73     0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74     0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75     0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76     0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77     0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78     0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79     0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80     0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81     0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82     0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83     0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84     0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85     0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86     0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87     0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88     0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89     0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90     0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91     0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92     0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93     0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94     0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95     0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96     0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97     0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98     0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99     0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100     0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101     0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102     0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103     0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104     0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105     0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106     0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107     0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108     0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109     0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110     0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111     0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112     0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113     0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114     0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115     0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116     0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117     0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118     0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119     0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120     0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121     0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122     0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123     0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124     0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125     0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126     0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127     0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128     0xffffffffffffffff,
129 };
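
/*
 * Worked example (illustration only): predicate byte 0x29 has bits 0, 3
 * and 5 set, so
 *
 *     expand_pred_b_data[0x29] == 0x0000ff00ff0000ff
 *
 * i.e. each active predicate bit is widened to an all-ones byte in the
 * corresponding lane, giving a mask that can be ANDed with vector data.
 */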
130 
131 /*
132  * Similarly for half-word elements.
133  *  for (i = 0; i < 256; ++i) {
134  *      unsigned long m = 0;
135  *      if (i & 0xaa) {
136  *          continue;
137  *      }
138  *      for (j = 0; j < 8; j += 2) {
139  *          if ((i >> j) & 1) {
140  *              m |= 0xfffful << (j << 3);
141  *          }
142  *      }
143  *      printf("[0x%x] = 0x%016lx,\n", i, m);
144  *  }
145  */
146 const uint64_t expand_pred_h_data[0x55 + 1] = {
147     [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148     [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149     [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150     [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151     [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152     [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153     [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154     [0x55] = 0xffffffffffffffff,
155 };
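
/*
 * Illustration: for halfword elements the predicate bits of interest sit
 * at even bit positions, so expand_pred_h_data[0x11] (bits 0 and 4 set)
 * is 0x0000ffff0000ffff, selecting halfword lanes 0 and 2.
 */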
156 
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159                      bool neg, bool round)
160 {
161     /*
162      * Simplify:
163      * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164      * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
165      */
166     int32_t ret = (int32_t)src1 * src2;
167     if (neg) {
168         ret = -ret;
169     }
170     ret += ((int32_t)src3 << 7) + (round << 6);
171     ret >>= 7;
172 
173     if (ret != (int8_t)ret) {
174         ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175     }
176     return ret;
177 }
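
/*
 * Worked example (illustration only), following the simplification above:
 * do_sqrdmlah_b(-128, -128, 0, false, true) computes
 *   ret = 16384; ret += (0 << 7) + (1 << 6) = 16448; ret >>= 7  ->  128
 * which does not fit in int8_t, so the result saturates to INT8_MAX.
 * This matches the architectural ((a3 << 8) + 2*e1*e2 + (1 << 7)) >> 8
 * with the out-of-range result saturated.
 */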
178 
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180                              void *va, uint32_t desc)
181 {
182     intptr_t i, opr_sz = simd_oprsz(desc);
183     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 
185     for (i = 0; i < opr_sz; ++i) {
186         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187     }
188 }
189 
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191                              void *va, uint32_t desc)
192 {
193     intptr_t i, opr_sz = simd_oprsz(desc);
194     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 
196     for (i = 0; i < opr_sz; ++i) {
197         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198     }
199 }
200 
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203     intptr_t i, opr_sz = simd_oprsz(desc);
204     int8_t *d = vd, *n = vn, *m = vm;
205 
206     for (i = 0; i < opr_sz; ++i) {
207         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208     }
209 }
210 
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213     intptr_t i, opr_sz = simd_oprsz(desc);
214     int8_t *d = vd, *n = vn, *m = vm;
215 
216     for (i = 0; i < opr_sz; ++i) {
217         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218     }
219 }
220 
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223                       bool neg, bool round, uint32_t *sat)
224 {
225     /* Simplify similarly to do_sqrdmlah_b above.  */
226     int32_t ret = (int32_t)src1 * src2;
227     if (neg) {
228         ret = -ret;
229     }
230     ret += ((int32_t)src3 << 15) + (round << 14);
231     ret >>= 15;
232 
233     if (ret != (int16_t)ret) {
234         *sat = 1;
235         ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236     }
237     return ret;
238 }
239 
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241                                   uint32_t src2, uint32_t src3)
242 {
243     uint32_t *sat = &env->vfp.qc[0];
244     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246                                 false, true, sat);
247     return deposit32(e1, 16, 16, e2);
248 }
249 
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251                               void *vq, uint32_t desc)
252 {
253     uintptr_t opr_sz = simd_oprsz(desc);
254     int16_t *d = vd;
255     int16_t *n = vn;
256     int16_t *m = vm;
257     uintptr_t i;
258 
259     for (i = 0; i < opr_sz / 2; ++i) {
260         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261     }
262     clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264 
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266                                   uint32_t src2, uint32_t src3)
267 {
268     uint32_t *sat = &env->vfp.qc[0];
269     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271                                 true, true, sat);
272     return deposit32(e1, 16, 16, e2);
273 }
274 
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276                               void *vq, uint32_t desc)
277 {
278     uintptr_t opr_sz = simd_oprsz(desc);
279     int16_t *d = vd;
280     int16_t *n = vn;
281     int16_t *m = vm;
282     uintptr_t i;
283 
284     for (i = 0; i < opr_sz / 2; ++i) {
285         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286     }
287     clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289 
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291                             void *vq, uint32_t desc)
292 {
293     intptr_t i, opr_sz = simd_oprsz(desc);
294     int16_t *d = vd, *n = vn, *m = vm;
295 
296     for (i = 0; i < opr_sz / 2; ++i) {
297         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298     }
299     clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301 
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303                              void *vq, uint32_t desc)
304 {
305     intptr_t i, opr_sz = simd_oprsz(desc);
306     int16_t *d = vd, *n = vn, *m = vm;
307 
308     for (i = 0; i < opr_sz / 2; ++i) {
309         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310     }
311     clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313 
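/*
 * For the *_idx_* helpers below, 'm' supplies a single multiplicand per
 * 128-bit segment: element 'idx' of each segment of vm is used as the
 * multiplier for every element of the corresponding segment of vn.
 * 'eltspersegment' clamps the inner loop for the 64-bit AdvSIMD case,
 * where opr_sz is only 8 bytes.
 */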
314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315                                 void *vq, uint32_t desc)
316 {
317     intptr_t i, j, opr_sz = simd_oprsz(desc);
318     int idx = simd_data(desc);
319     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320     intptr_t elements = opr_sz / 2;
321     intptr_t eltspersegment = MIN(16 / 2, elements);
322 
323     for (i = 0; i < elements; i += 16 / 2) {
324         int16_t mm = m[i];
325         for (j = 0; j < eltspersegment; ++j) {
326             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
327         }
328     }
329     clear_tail(d, opr_sz, simd_maxsz(desc));
330 }
331 
332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
333                                  void *vq, uint32_t desc)
334 {
335     intptr_t i, j, opr_sz = simd_oprsz(desc);
336     int idx = simd_data(desc);
337     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338     intptr_t elements = opr_sz / 2;
339     intptr_t eltspersegment = MIN(16 / 2, elements);
340 
341     for (i = 0; i < elements; i += 16 / 2) {
342         int16_t mm = m[i];
343         for (j = 0; j < eltspersegment; ++j) {
344             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
345         }
346     }
347     clear_tail(d, opr_sz, simd_maxsz(desc));
348 }
349 
350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
351                                  void *vq, uint32_t desc)
352 {
353     intptr_t i, j, opr_sz = simd_oprsz(desc);
354     int idx = simd_data(desc);
355     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
356     intptr_t elements = opr_sz / 2;
357     intptr_t eltspersegment = MIN(16 / 2, elements);
358 
359     for (i = 0; i < elements; i += 16 / 2) {
360         int16_t mm = m[i];
361         for (j = 0; j < eltspersegment; ++j) {
362             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
363         }
364     }
365     clear_tail(d, opr_sz, simd_maxsz(desc));
366 }
367 
368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
369                                  void *vq, uint32_t desc)
370 {
371     intptr_t i, j, opr_sz = simd_oprsz(desc);
372     int idx = simd_data(desc);
373     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
374     intptr_t elements = opr_sz / 2;
375     intptr_t eltspersegment = MIN(16 / 2, elements);
376 
377     for (i = 0; i < elements; i += 16 / 2) {
378         int16_t mm = m[i];
379         for (j = 0; j < eltspersegment; ++j) {
380             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
381         }
382     }
383     clear_tail(d, opr_sz, simd_maxsz(desc));
384 }
385 
386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
387                              void *va, uint32_t desc)
388 {
389     intptr_t i, opr_sz = simd_oprsz(desc);
390     int16_t *d = vd, *n = vn, *m = vm, *a = va;
391     uint32_t discard;
392 
393     for (i = 0; i < opr_sz / 2; ++i) {
394         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
395     }
396 }
397 
398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
399                              void *va, uint32_t desc)
400 {
401     intptr_t i, opr_sz = simd_oprsz(desc);
402     int16_t *d = vd, *n = vn, *m = vm, *a = va;
403     uint32_t discard;
404 
405     for (i = 0; i < opr_sz / 2; ++i) {
406         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
407     }
408 }
409 
410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
411 {
412     intptr_t i, opr_sz = simd_oprsz(desc);
413     int16_t *d = vd, *n = vn, *m = vm;
414     uint32_t discard;
415 
416     for (i = 0; i < opr_sz / 2; ++i) {
417         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
418     }
419 }
420 
421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
422 {
423     intptr_t i, opr_sz = simd_oprsz(desc);
424     int16_t *d = vd, *n = vn, *m = vm;
425     uint32_t discard;
426 
427     for (i = 0; i < opr_sz / 2; ++i) {
428         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
429     }
430 }
431 
432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
433 {
434     intptr_t i, j, opr_sz = simd_oprsz(desc);
435     int idx = simd_data(desc);
436     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
437     uint32_t discard;
438 
439     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
440         int16_t mm = m[i];
441         for (j = 0; j < 16 / 2; ++j) {
442             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
443         }
444     }
445 }
446 
447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
448 {
449     intptr_t i, j, opr_sz = simd_oprsz(desc);
450     int idx = simd_data(desc);
451     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
452     uint32_t discard;
453 
454     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
455         int16_t mm = m[i];
456         for (j = 0; j < 16 / 2; ++j) {
457             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
458         }
459     }
460 }
461 
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
464                       bool neg, bool round, uint32_t *sat)
465 {
466     /* Simplify similarly to do_sqrdmlah_b above.  */
467     int64_t ret = (int64_t)src1 * src2;
468     if (neg) {
469         ret = -ret;
470     }
471     ret += ((int64_t)src3 << 31) + (round << 30);
472     ret >>= 31;
473 
474     if (ret != (int32_t)ret) {
475         *sat = 1;
476         ret = (ret < 0 ? INT32_MIN : INT32_MAX);
477     }
478     return ret;
479 }
480 
481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
482                                   int32_t src2, int32_t src3)
483 {
484     uint32_t *sat = &env->vfp.qc[0];
485     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
486 }
487 
488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
489                               void *vq, uint32_t desc)
490 {
491     uintptr_t opr_sz = simd_oprsz(desc);
492     int32_t *d = vd;
493     int32_t *n = vn;
494     int32_t *m = vm;
495     uintptr_t i;
496 
497     for (i = 0; i < opr_sz / 4; ++i) {
498         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
499     }
500     clear_tail(d, opr_sz, simd_maxsz(desc));
501 }
502 
503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
504                                   int32_t src2, int32_t src3)
505 {
506     uint32_t *sat = &env->vfp.qc[0];
507     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
508 }
509 
510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
511                               void *vq, uint32_t desc)
512 {
513     uintptr_t opr_sz = simd_oprsz(desc);
514     int32_t *d = vd;
515     int32_t *n = vn;
516     int32_t *m = vm;
517     uintptr_t i;
518 
519     for (i = 0; i < opr_sz / 4; ++i) {
520         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
521     }
522     clear_tail(d, opr_sz, simd_maxsz(desc));
523 }
524 
525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
526                             void *vq, uint32_t desc)
527 {
528     intptr_t i, opr_sz = simd_oprsz(desc);
529     int32_t *d = vd, *n = vn, *m = vm;
530 
531     for (i = 0; i < opr_sz / 4; ++i) {
532         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
533     }
534     clear_tail(d, opr_sz, simd_maxsz(desc));
535 }
536 
537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
538                              void *vq, uint32_t desc)
539 {
540     intptr_t i, opr_sz = simd_oprsz(desc);
541     int32_t *d = vd, *n = vn, *m = vm;
542 
543     for (i = 0; i < opr_sz / 4; ++i) {
544         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
545     }
546     clear_tail(d, opr_sz, simd_maxsz(desc));
547 }
548 
549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
550                                 void *vq, uint32_t desc)
551 {
552     intptr_t i, j, opr_sz = simd_oprsz(desc);
553     int idx = simd_data(desc);
554     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
555     intptr_t elements = opr_sz / 4;
556     intptr_t eltspersegment = MIN(16 / 4, elements);
557 
558     for (i = 0; i < elements; i += 16 / 4) {
559         int32_t mm = m[i];
560         for (j = 0; j < eltspersegment; ++j) {
561             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
562         }
563     }
564     clear_tail(d, opr_sz, simd_maxsz(desc));
565 }
566 
567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
568                                  void *vq, uint32_t desc)
569 {
570     intptr_t i, j, opr_sz = simd_oprsz(desc);
571     int idx = simd_data(desc);
572     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
573     intptr_t elements = opr_sz / 4;
574     intptr_t eltspersegment = MIN(16 / 4, elements);
575 
576     for (i = 0; i < elements; i += 16 / 4) {
577         int32_t mm = m[i];
578         for (j = 0; j < eltspersegment; ++j) {
579             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
580         }
581     }
582     clear_tail(d, opr_sz, simd_maxsz(desc));
583 }
584 
585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
586                                  void *vq, uint32_t desc)
587 {
588     intptr_t i, j, opr_sz = simd_oprsz(desc);
589     int idx = simd_data(desc);
590     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
591     intptr_t elements = opr_sz / 4;
592     intptr_t eltspersegment = MIN(16 / 4, elements);
593 
594     for (i = 0; i < elements; i += 16 / 4) {
595         int32_t mm = m[i];
596         for (j = 0; j < eltspersegment; ++j) {
597             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
598         }
599     }
600     clear_tail(d, opr_sz, simd_maxsz(desc));
601 }
602 
603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
604                                  void *vq, uint32_t desc)
605 {
606     intptr_t i, j, opr_sz = simd_oprsz(desc);
607     int idx = simd_data(desc);
608     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
609     intptr_t elements = opr_sz / 4;
610     intptr_t eltspersegment = MIN(16 / 4, elements);
611 
612     for (i = 0; i < elements; i += 16 / 4) {
613         int32_t mm = m[i];
614         for (j = 0; j < eltspersegment; ++j) {
615             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
616         }
617     }
618     clear_tail(d, opr_sz, simd_maxsz(desc));
619 }
620 
621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
622                              void *va, uint32_t desc)
623 {
624     intptr_t i, opr_sz = simd_oprsz(desc);
625     int32_t *d = vd, *n = vn, *m = vm, *a = va;
626     uint32_t discard;
627 
628     for (i = 0; i < opr_sz / 4; ++i) {
629         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
630     }
631 }
632 
633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
634                              void *va, uint32_t desc)
635 {
636     intptr_t i, opr_sz = simd_oprsz(desc);
637     int32_t *d = vd, *n = vn, *m = vm, *a = va;
638     uint32_t discard;
639 
640     for (i = 0; i < opr_sz / 4; ++i) {
641         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
642     }
643 }
644 
645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
646 {
647     intptr_t i, opr_sz = simd_oprsz(desc);
648     int32_t *d = vd, *n = vn, *m = vm;
649     uint32_t discard;
650 
651     for (i = 0; i < opr_sz / 4; ++i) {
652         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
653     }
654 }
655 
656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
657 {
658     intptr_t i, opr_sz = simd_oprsz(desc);
659     int32_t *d = vd, *n = vn, *m = vm;
660     uint32_t discard;
661 
662     for (i = 0; i < opr_sz / 4; ++i) {
663         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
664     }
665 }
666 
667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
668 {
669     intptr_t i, j, opr_sz = simd_oprsz(desc);
670     int idx = simd_data(desc);
671     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
672     uint32_t discard;
673 
674     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
675         int32_t mm = m[i];
676         for (j = 0; j < 16 / 4; ++j) {
677             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
678         }
679     }
680 }
681 
682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
683 {
684     intptr_t i, j, opr_sz = simd_oprsz(desc);
685     int idx = simd_data(desc);
686     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
687     uint32_t discard;
688 
689     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
690         int32_t mm = m[i];
691         for (j = 0; j < 16 / 4; ++j) {
692             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
693         }
694     }
695 }
696 
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
698 static int64_t do_sat128_d(Int128 r)
699 {
700     int64_t ls = int128_getlo(r);
701     int64_t hs = int128_gethi(r);
702 
703     if (unlikely(hs != (ls >> 63))) {
704         return hs < 0 ? INT64_MIN : INT64_MAX;
705     }
706     return ls;
707 }
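
/*
 * Illustration: a 128-bit value fits in int64_t exactly when its high
 * half is the sign extension of its low half.  E.g. for r = 2^63,
 * ls = 0x8000000000000000 (so ls >> 63 == -1) while hs == 0, so the
 * test above fires and the result saturates to INT64_MAX.
 */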
708 
709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710 {
711     uint64_t l, h;
712     Int128 r, t;
713 
714     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
715     muls64(&l, &h, m, n);
716     r = int128_make128(l, h);
717     if (neg) {
718         r = int128_neg(r);
719     }
720     if (a) {
721         t = int128_exts64(a);
722         t = int128_lshift(t, 63);
723         r = int128_add(r, t);
724     }
725     if (round) {
726         t = int128_exts64(1ll << 62);
727         r = int128_add(r, t);
728     }
729     r = int128_rshift(r, 63);
730 
731     return do_sat128_d(r);
732 }
733 
734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735                              void *va, uint32_t desc)
736 {
737     intptr_t i, opr_sz = simd_oprsz(desc);
738     int64_t *d = vd, *n = vn, *m = vm, *a = va;
739 
740     for (i = 0; i < opr_sz / 8; ++i) {
741         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742     }
743 }
744 
745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746                              void *va, uint32_t desc)
747 {
748     intptr_t i, opr_sz = simd_oprsz(desc);
749     int64_t *d = vd, *n = vn, *m = vm, *a = va;
750 
751     for (i = 0; i < opr_sz / 8; ++i) {
752         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753     }
754 }
755 
756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757 {
758     intptr_t i, opr_sz = simd_oprsz(desc);
759     int64_t *d = vd, *n = vn, *m = vm;
760 
761     for (i = 0; i < opr_sz / 8; ++i) {
762         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763     }
764 }
765 
766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767 {
768     intptr_t i, opr_sz = simd_oprsz(desc);
769     int64_t *d = vd, *n = vn, *m = vm;
770 
771     for (i = 0; i < opr_sz / 8; ++i) {
772         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773     }
774 }
775 
776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777 {
778     intptr_t i, j, opr_sz = simd_oprsz(desc);
779     int idx = simd_data(desc);
780     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781 
782     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783         int64_t mm = m[i];
784         for (j = 0; j < 16 / 8; ++j) {
785             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786         }
787     }
788 }
789 
790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791 {
792     intptr_t i, j, opr_sz = simd_oprsz(desc);
793     int idx = simd_data(desc);
794     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795 
796     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797         int64_t mm = m[i];
798         for (j = 0; j < 16 / 8; ++j) {
799             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800         }
801     }
802 }
803 
804 /* Integer 8-bit and 16-bit dot-product.
805  *
806  * Note that for the loops herein, host endianness does not matter
807  * with respect to the ordering of data within the quad-width lanes.
808  * All elements are treated equally, no matter where they are.
809  */
810 
811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
813 {                                                                         \
814     intptr_t i, opr_sz = simd_oprsz(desc);                                \
815     TYPED *d = vd, *a = va;                                               \
816     TYPEN *n = vn;                                                        \
817     TYPEM *m = vm;                                                        \
818     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
819         d[i] = (a[i] +                                                    \
820                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
821                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
822                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
823                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
824     }                                                                     \
825     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
826 }
827 
828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
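
/*
 * Illustration: for gvec_sdot_b each 32-bit destination lane accumulates
 * four byte products, e.g. with n = {1, 2, 3, 4}, m = {5, 6, 7, 8} and
 * a[0] = 10 in lane 0:
 *     d[0] = 10 + 1*5 + 2*6 + 3*7 + 4*8 = 80
 */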
833 
834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
836 {                                                                         \
837     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
838     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
839     /*                                                                    \
840      * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
841      * first iteration might not be a full 16 byte segment. But           \
842      * for vector lengths beyond that this must be SVE and we know        \
843      * opr_sz is a multiple of 16, so we need not clamp segend            \
844      * to opr_sz_n when we advance it at the end of the loop.             \
845      */                                                                   \
846     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
847     intptr_t index = simd_data(desc);                                     \
848     TYPED *d = vd, *a = va;                                               \
849     TYPEN *n = vn;                                                        \
850     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
851     do {                                                                  \
852         TYPED m0 = m_indexed[i * 4 + 0];                                  \
853         TYPED m1 = m_indexed[i * 4 + 1];                                  \
854         TYPED m2 = m_indexed[i * 4 + 2];                                  \
855         TYPED m3 = m_indexed[i * 4 + 3];                                  \
856         do {                                                              \
857             d[i] = (a[i] +                                                \
858                     n[i * 4 + 0] * m0 +                                   \
859                     n[i * 4 + 1] * m1 +                                   \
860                     n[i * 4 + 2] * m2 +                                   \
861                     n[i * 4 + 3] * m3);                                   \
862         } while (++i < segend);                                           \
863         segend = i + (16 / sizeof(TYPED));                                \
864     } while (i < opr_sz_n);                                               \
865     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
866 }
867 
868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
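
/*
 * Illustration: in the indexed forms, 'index' selects one group of four
 * narrow elements of vm within each 16-byte segment; that group (m0..m3)
 * is loaded once and reused for every dot product in the segment.  E.g.
 * for gvec_sdot_idx_b with index == 2, the third 32-bit group of each
 * segment of vm multiplies every group of four bytes of vn in that
 * segment.
 */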
874 
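/*
 * Floating-point complex add, FCADD.  Elements form complex pairs with
 * the real part at the even index and the imaginary part at the odd
 * index.  The single data bit selects the sign pattern:
 *     d.real = n.real +/- m.imag,  d.imag = n.imag -/+ m.real
 * so that m is effectively multiplied by +/-i (a rotation of 90 or 270
 * degrees) before the add.
 */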
875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
876                          void *vfpst, uint32_t desc)
877 {
878     uintptr_t opr_sz = simd_oprsz(desc);
879     float16 *d = vd;
880     float16 *n = vn;
881     float16 *m = vm;
882     float_status *fpst = vfpst;
883     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
884     uint32_t neg_imag = neg_real ^ 1;
885     uintptr_t i;
886 
887     /* Shift boolean to the sign bit so we can xor to negate.  */
888     neg_real <<= 15;
889     neg_imag <<= 15;
890 
891     for (i = 0; i < opr_sz / 2; i += 2) {
892         float16 e0 = n[H2(i)];
893         float16 e1 = m[H2(i + 1)] ^ neg_imag;
894         float16 e2 = n[H2(i + 1)];
895         float16 e3 = m[H2(i)] ^ neg_real;
896 
897         d[H2(i)] = float16_add(e0, e1, fpst);
898         d[H2(i + 1)] = float16_add(e2, e3, fpst);
899     }
900     clear_tail(d, opr_sz, simd_maxsz(desc));
901 }
902 
903 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
904                          void *vfpst, uint32_t desc)
905 {
906     uintptr_t opr_sz = simd_oprsz(desc);
907     float32 *d = vd;
908     float32 *n = vn;
909     float32 *m = vm;
910     float_status *fpst = vfpst;
911     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
912     uint32_t neg_imag = neg_real ^ 1;
913     uintptr_t i;
914 
915     /* Shift boolean to the sign bit so we can xor to negate.  */
916     neg_real <<= 31;
917     neg_imag <<= 31;
918 
919     for (i = 0; i < opr_sz / 4; i += 2) {
920         float32 e0 = n[H4(i)];
921         float32 e1 = m[H4(i + 1)] ^ neg_imag;
922         float32 e2 = n[H4(i + 1)];
923         float32 e3 = m[H4(i)] ^ neg_real;
924 
925         d[H4(i)] = float32_add(e0, e1, fpst);
926         d[H4(i + 1)] = float32_add(e2, e3, fpst);
927     }
928     clear_tail(d, opr_sz, simd_maxsz(desc));
929 }
930 
931 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
932                          void *vfpst, uint32_t desc)
933 {
934     uintptr_t opr_sz = simd_oprsz(desc);
935     float64 *d = vd;
936     float64 *n = vn;
937     float64 *m = vm;
938     float_status *fpst = vfpst;
939     uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
940     uint64_t neg_imag = neg_real ^ 1;
941     uintptr_t i;
942 
943     /* Shift boolean to the sign bit so we can xor to negate.  */
944     neg_real <<= 63;
945     neg_imag <<= 63;
946 
947     for (i = 0; i < opr_sz / 8; i += 2) {
948         float64 e0 = n[i];
949         float64 e1 = m[i + 1] ^ neg_imag;
950         float64 e2 = n[i + 1];
951         float64 e3 = m[i] ^ neg_real;
952 
953         d[i] = float64_add(e0, e1, fpst);
954         d[i + 1] = float64_add(e2, e3, fpst);
955     }
956     clear_tail(d, opr_sz, simd_maxsz(desc));
957 }
958 
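/*
 * Floating-point complex multiply-add, FCMLA.  The two SIMD_DATA bits
 * encode the rotation: 'flip' selects whether the real or the imaginary
 * part of n (and of m) feeds each product, and 'neg_real'/'neg_imag'
 * supply the sign flips; together they give the four architectural
 * rotations (0, 90, 180, 270 degrees), each instruction contributing one
 * half of the full complex multiply.
 */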
959 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
960                          void *vfpst, uint32_t desc)
961 {
962     uintptr_t opr_sz = simd_oprsz(desc);
963     float16 *d = vd, *n = vn, *m = vm, *a = va;
964     float_status *fpst = vfpst;
965     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
966     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
967     uint32_t neg_real = flip ^ neg_imag;
968     uintptr_t i;
969 
970     /* Shift boolean to the sign bit so we can xor to negate.  */
971     neg_real <<= 15;
972     neg_imag <<= 15;
973 
974     for (i = 0; i < opr_sz / 2; i += 2) {
975         float16 e2 = n[H2(i + flip)];
976         float16 e1 = m[H2(i + flip)] ^ neg_real;
977         float16 e4 = e2;
978         float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
979 
980         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
981         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
982     }
983     clear_tail(d, opr_sz, simd_maxsz(desc));
984 }
985 
986 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
987                              void *vfpst, uint32_t desc)
988 {
989     uintptr_t opr_sz = simd_oprsz(desc);
990     float16 *d = vd, *n = vn, *m = vm, *a = va;
991     float_status *fpst = vfpst;
992     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
993     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
994     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
995     uint32_t neg_real = flip ^ neg_imag;
996     intptr_t elements = opr_sz / sizeof(float16);
997     intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
998     intptr_t i, j;
999 
1000     /* Shift boolean to the sign bit so we can xor to negate.  */
1001     neg_real <<= 15;
1002     neg_imag <<= 15;
1003 
1004     for (i = 0; i < elements; i += eltspersegment) {
1005         float16 mr = m[H2(i + 2 * index + 0)];
1006         float16 mi = m[H2(i + 2 * index + 1)];
1007         float16 e1 = neg_real ^ (flip ? mi : mr);
1008         float16 e3 = neg_imag ^ (flip ? mr : mi);
1009 
1010         for (j = i; j < i + eltspersegment; j += 2) {
1011             float16 e2 = n[H2(j + flip)];
1012             float16 e4 = e2;
1013 
1014             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
1015             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
1016         }
1017     }
1018     clear_tail(d, opr_sz, simd_maxsz(desc));
1019 }
1020 
1021 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
1022                          void *vfpst, uint32_t desc)
1023 {
1024     uintptr_t opr_sz = simd_oprsz(desc);
1025     float32 *d = vd, *n = vn, *m = vm, *a = va;
1026     float_status *fpst = vfpst;
1027     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1028     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1029     uint32_t neg_real = flip ^ neg_imag;
1030     uintptr_t i;
1031 
1032     /* Shift boolean to the sign bit so we can xor to negate.  */
1033     neg_real <<= 31;
1034     neg_imag <<= 31;
1035 
1036     for (i = 0; i < opr_sz / 4; i += 2) {
1037         float32 e2 = n[H4(i + flip)];
1038         float32 e1 = m[H4(i + flip)] ^ neg_real;
1039         float32 e4 = e2;
1040         float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
1041 
1042         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
1043         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
1044     }
1045     clear_tail(d, opr_sz, simd_maxsz(desc));
1046 }
1047 
1048 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
1049                              void *vfpst, uint32_t desc)
1050 {
1051     uintptr_t opr_sz = simd_oprsz(desc);
1052     float32 *d = vd, *n = vn, *m = vm, *a = va;
1053     float_status *fpst = vfpst;
1054     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1055     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1056     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1057     uint32_t neg_real = flip ^ neg_imag;
1058     intptr_t elements = opr_sz / sizeof(float32);
1059     intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
1060     intptr_t i, j;
1061 
1062     /* Shift boolean to the sign bit so we can xor to negate.  */
1063     neg_real <<= 31;
1064     neg_imag <<= 31;
1065 
1066     for (i = 0; i < elements; i += eltspersegment) {
1067         float32 mr = m[H4(i + 2 * index + 0)];
1068         float32 mi = m[H4(i + 2 * index + 1)];
1069         float32 e1 = neg_real ^ (flip ? mi : mr);
1070         float32 e3 = neg_imag ^ (flip ? mr : mi);
1071 
1072         for (j = i; j < i + eltspersegment; j += 2) {
1073             float32 e2 = n[H4(j + flip)];
1074             float32 e4 = e2;
1075 
1076             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
1077             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
1078         }
1079     }
1080     clear_tail(d, opr_sz, simd_maxsz(desc));
1081 }
1082 
1083 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
1084                          void *vfpst, uint32_t desc)
1085 {
1086     uintptr_t opr_sz = simd_oprsz(desc);
1087     float64 *d = vd, *n = vn, *m = vm, *a = va;
1088     float_status *fpst = vfpst;
1089     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1090     uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1091     uint64_t neg_real = flip ^ neg_imag;
1092     uintptr_t i;
1093 
1094     /* Shift boolean to the sign bit so we can xor to negate.  */
1095     neg_real <<= 63;
1096     neg_imag <<= 63;
1097 
1098     for (i = 0; i < opr_sz / 8; i += 2) {
1099         float64 e2 = n[i + flip];
1100         float64 e1 = m[i + flip] ^ neg_real;
1101         float64 e4 = e2;
1102         float64 e3 = m[i + 1 - flip] ^ neg_imag;
1103 
1104         d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
1105         d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
1106     }
1107     clear_tail(d, opr_sz, simd_maxsz(desc));
1108 }
1109 
1110 /*
1111  * Floating point comparisons producing an integer result (all 1s or all 0s).
1112  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1113  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1114  */
1115 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1116 {
1117     return -float16_eq_quiet(op1, op2, stat);
1118 }
1119 
1120 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1121 {
1122     return -float32_eq_quiet(op1, op2, stat);
1123 }
1124 
1125 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1126 {
1127     return -float64_eq_quiet(op1, op2, stat);
1128 }
1129 
1130 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1131 {
1132     return -float16_le(op2, op1, stat);
1133 }
1134 
1135 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1136 {
1137     return -float32_le(op2, op1, stat);
1138 }
1139 
1140 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1141 {
1142     return -float64_le(op2, op1, stat);
1143 }
1144 
1145 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1146 {
1147     return -float16_lt(op2, op1, stat);
1148 }
1149 
1150 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1151 {
1152     return -float32_lt(op2, op1, stat);
1153 }
1154 
1155 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1156 {
1157     return -float64_lt(op2, op1, stat);
1158 }
1159 
1160 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1161 {
1162     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1163 }
1164 
1165 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1166 {
1167     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1168 }
1169 
1170 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1171 {
1172     return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1173 }
1174 
1175 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1176 {
1177     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1178 }
1179 
1180 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1181 {
1182     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1183 }
1184 
1185 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1186 {
1187     return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1188 }
1189 
1190 static int16_t vfp_tosszh(float16 x, void *fpstp)
1191 {
1192     float_status *fpst = fpstp;
1193     if (float16_is_any_nan(x)) {
1194         float_raise(float_flag_invalid, fpst);
1195         return 0;
1196     }
1197     return float16_to_int16_round_to_zero(x, fpst);
1198 }
1199 
1200 static uint16_t vfp_touszh(float16 x, void *fpstp)
1201 {
1202     float_status *fpst = fpstp;
1203     if (float16_is_any_nan(x)) {
1204         float_raise(float_flag_invalid, fpst);
1205         return 0;
1206     }
1207     return float16_to_uint16_round_to_zero(x, fpst);
1208 }
1209 
1210 #define DO_2OP(NAME, FUNC, TYPE) \
1211 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
1212 {                                                                 \
1213     intptr_t i, oprsz = simd_oprsz(desc);                         \
1214     TYPE *d = vd, *n = vn;                                        \
1215     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1216         d[i] = FUNC(n[i], stat);                                  \
1217     }                                                             \
1218     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1219 }
1220 
1221 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1222 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1223 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1224 
1225 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1226 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1227 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1228 
1229 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1230 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1231 
1232 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1233 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1234 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1235 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1236 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1237 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1238 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1239 DO_2OP(gvec_touszh, vfp_touszh, float16)
1240 
1241 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1242     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1243     {                                                           \
1244         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1245     }
1246 
1247 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1248     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1249     {                                                           \
1250         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1251     }
1252 
1253 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1254     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1255     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1256     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1257     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1258 
1259 DO_2OP_CMP0(cgt, cgt, FWD)
1260 DO_2OP_CMP0(cge, cge, FWD)
1261 DO_2OP_CMP0(ceq, ceq, FWD)
1262 DO_2OP_CMP0(clt, cgt, REV)
1263 DO_2OP_CMP0(cle, cge, REV)
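
/*
 * The REV forms implement compare-less-than/less-or-equal against zero by
 * swapping the operands of the greater-than/greater-or-equal helpers:
 * x < 0 is evaluated as 0 > x, and x <= 0 as 0 >= x.
 */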
1264 
1265 #undef DO_2OP
1266 #undef DO_2OP_CMP0
1267 
1268 /* Floating-point trigonometric starting value.
1269  * See the ARM ARM pseudocode function FPTrigSMul.
1270  */
1271 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1272 {
1273     float16 result = float16_mul(op1, op1, stat);
1274     if (!float16_is_any_nan(result)) {
1275         result = float16_set_sign(result, op2 & 1);
1276     }
1277     return result;
1278 }
1279 
1280 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1281 {
1282     float32 result = float32_mul(op1, op1, stat);
1283     if (!float32_is_any_nan(result)) {
1284         result = float32_set_sign(result, op2 & 1);
1285     }
1286     return result;
1287 }
1288 
1289 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1290 {
1291     float64 result = float64_mul(op1, op1, stat);
1292     if (!float64_is_any_nan(result)) {
1293         result = float64_set_sign(result, op2 & 1);
1294     }
1295     return result;
1296 }
1297 
1298 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1299 {
1300     return float16_abs(float16_sub(op1, op2, stat));
1301 }
1302 
1303 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1304 {
1305     return float32_abs(float32_sub(op1, op2, stat));
1306 }
1307 
1308 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1309 {
1310     return float64_abs(float64_sub(op1, op2, stat));
1311 }
1312 
1313 /*
1314  * Reciprocal step. These are the AArch32 versions, which use a
1315  * non-fused multiply-and-subtract.
1316  */
1317 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1318 {
1319     op1 = float16_squash_input_denormal(op1, stat);
1320     op2 = float16_squash_input_denormal(op2, stat);
1321 
1322     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1323         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1324         return float16_two;
1325     }
1326     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1327 }
1328 
1329 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1330 {
1331     op1 = float32_squash_input_denormal(op1, stat);
1332     op2 = float32_squash_input_denormal(op2, stat);
1333 
1334     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1335         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1336         return float32_two;
1337     }
1338     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1339 }
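
/*
 * The value (2 - op1 * op2) is the correction factor of a Newton-Raphson
 * step for the reciprocal: x' = x * (2 - d * x) converges to 1/d, so the
 * caller multiplies this result into its current estimate.
 */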
1340 
1341 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1342 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1343 {
1344     op1 = float16_squash_input_denormal(op1, stat);
1345     op2 = float16_squash_input_denormal(op2, stat);
1346 
1347     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1348         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1349         return float16_one_point_five;
1350     }
1351     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1352     return float16_div(op1, float16_two, stat);
1353 }
1354 
1355 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1356 {
1357     op1 = float32_squash_input_denormal(op1, stat);
1358     op2 = float32_squash_input_denormal(op2, stat);
1359 
1360     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1361         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1362         return float32_one_point_five;
1363     }
1364     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1365     return float32_div(op1, float32_two, stat);
1366 }
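
/*
 * Likewise (3 - op1 * op2) / 2 is the Newton-Raphson correction factor
 * for the reciprocal square root: x' = x * (3 - d * x * x) / 2 converges
 * to 1/sqrt(d); how the product d * x * x is split across op1 and op2 is
 * up to the caller.
 */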
1367 
1368 #define DO_3OP(NAME, FUNC, TYPE) \
1369 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1370 {                                                                          \
1371     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1372     TYPE *d = vd, *n = vn, *m = vm;                                        \
1373     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1374         d[i] = FUNC(n[i], m[i], stat);                                     \
1375     }                                                                      \
1376     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1377 }
1378 
1379 DO_3OP(gvec_fadd_h, float16_add, float16)
1380 DO_3OP(gvec_fadd_s, float32_add, float32)
1381 DO_3OP(gvec_fadd_d, float64_add, float64)
1382 
1383 DO_3OP(gvec_fsub_h, float16_sub, float16)
1384 DO_3OP(gvec_fsub_s, float32_sub, float32)
1385 DO_3OP(gvec_fsub_d, float64_sub, float64)
1386 
1387 DO_3OP(gvec_fmul_h, float16_mul, float16)
1388 DO_3OP(gvec_fmul_s, float32_mul, float32)
1389 DO_3OP(gvec_fmul_d, float64_mul, float64)
1390 
1391 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1392 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1393 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1394 
1395 DO_3OP(gvec_fabd_h, float16_abd, float16)
1396 DO_3OP(gvec_fabd_s, float32_abd, float32)
1397 DO_3OP(gvec_fabd_d, float64_abd, float64)
1398 
1399 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1400 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1401 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1402 
1403 DO_3OP(gvec_fcge_h, float16_cge, float16)
1404 DO_3OP(gvec_fcge_s, float32_cge, float32)
1405 DO_3OP(gvec_fcge_d, float64_cge, float64)
1406 
1407 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1408 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1409 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1410 
1411 DO_3OP(gvec_facge_h, float16_acge, float16)
1412 DO_3OP(gvec_facge_s, float32_acge, float32)
1413 DO_3OP(gvec_facge_d, float64_acge, float64)
1414 
1415 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1416 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1417 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1418 
1419 DO_3OP(gvec_fmax_h, float16_max, float16)
1420 DO_3OP(gvec_fmax_s, float32_max, float32)
1421 DO_3OP(gvec_fmax_d, float64_max, float64)
1422 
1423 DO_3OP(gvec_fmin_h, float16_min, float16)
1424 DO_3OP(gvec_fmin_s, float32_min, float32)
1425 DO_3OP(gvec_fmin_d, float64_min, float64)
1426 
1427 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1428 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1429 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1430 
1431 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1432 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1433 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1434 
1435 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1436 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1437 
1438 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1439 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1440 
1441 #ifdef TARGET_AARCH64
1442 DO_3OP(gvec_fdiv_h, float16_div, float16)
1443 DO_3OP(gvec_fdiv_s, float32_div, float32)
1444 DO_3OP(gvec_fdiv_d, float64_div, float64)
1445 
1446 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1447 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1448 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1449 
1450 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1451 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1452 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1453 
1454 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1455 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1456 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1457 
1458 #endif
1459 #undef DO_3OP
1460 
1461 /* Non-fused multiply-add (unlike float16_muladd etc., which are fused) */
1462 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1463                                  float_status *stat)
1464 {
1465     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1466 }
1467 
1468 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1469                                  float_status *stat)
1470 {
1471     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1472 }
1473 
1474 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1475                                  float_status *stat)
1476 {
1477     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1478 }
1479 
1480 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1481                                  float_status *stat)
1482 {
1483     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1484 }
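
     /*
      * A minimal scalar sketch of the difference (the names r_nf and r_f
      * are purely illustrative): the non-fused forms above round after
      * the multiply and again after the add/sub, as Neon VMLA/VMLS
      * require, while the fused forms below round only once:
      *
      *     r_nf = float32_add(d, float32_mul(n, m, st), st);  // two roundings
      *     r_f  = float32_muladd(n, m, d, 0, st);             // one rounding
      */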
1485 
1486 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1487 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1488                                 float_status *stat)
1489 {
1490     return float16_muladd(op1, op2, dest, 0, stat);
1491 }
1492 
1493 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1494                                  float_status *stat)
1495 {
1496     return float32_muladd(op1, op2, dest, 0, stat);
1497 }
1498 
1499 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1500                                  float_status *stat)
1501 {
1502     return float64_muladd(op1, op2, dest, 0, stat);
1503 }
1504 
1505 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1506                                  float_status *stat)
1507 {
1508     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1509 }
1510 
1511 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1512                                  float_status *stat)
1513 {
1514     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1515 }
1516 
1517 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1518                                  float_status *stat)
1519 {
1520     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1521 }
1522 
1523 #define DO_MULADD(NAME, FUNC, TYPE)                                     \
1524 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1525 {                                                                          \
1526     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1527     TYPE *d = vd, *n = vn, *m = vm;                                        \
1528     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1529         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1530     }                                                                      \
1531     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1532 }
1533 
1534 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1535 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1536 
1537 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1538 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1539 
1540 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1541 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1542 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1543 
1544 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1545 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1546 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1547 
1548 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1549  * For AdvSIMD, there is of course only one such vector segment.
1550  */
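
     /*
      * Illustrative sketch with hypothetical values: for 32-bit elements,
      * oprsz == 32 and idx == 1, the segment holds four elements and the
      * loops below behave like
      *
      *     for (i = 0; i < 8; i += 4) {
      *         for (j = 0; j < 4; j++) {
      *             d[i + j] = n[i + j] * m[i + 1];
      *         }
      *     }
      *
      * i.e. each 128-bit segment uses element 1 of its own segment of m.
      */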
1551 
1552 #define DO_MUL_IDX(NAME, TYPE, H) \
1553 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1554 {                                                                          \
1555     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1556     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1557     intptr_t idx = simd_data(desc);                                        \
1558     TYPE *d = vd, *n = vn, *m = vm;                                        \
1559     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1560         TYPE mm = m[H(i + idx)];                                           \
1561         for (j = 0; j < segment; j++) {                                    \
1562             d[i + j] = n[i + j] * mm;                                      \
1563         }                                                                  \
1564     }                                                                      \
1565     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1566 }
1567 
1568 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1569 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1570 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1571 
1572 #undef DO_MUL_IDX
1573 
1574 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1575 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1576 {                                                                          \
1577     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1578     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1579     intptr_t idx = simd_data(desc);                                        \
1580     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1581     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1582         TYPE mm = m[H(i + idx)];                                           \
1583         for (j = 0; j < segment; j++) {                                    \
1584             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1585         }                                                                  \
1586     }                                                                      \
1587     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1588 }
1589 
1590 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1591 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1592 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1593 
1594 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1595 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1596 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1597 
1598 #undef DO_MLA_IDX
1599 
1600 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1601 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1602 {                                                                          \
1603     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1604     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1605     intptr_t idx = simd_data(desc);                                        \
1606     TYPE *d = vd, *n = vn, *m = vm;                                        \
1607     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1608         TYPE mm = m[H(i + idx)];                                           \
1609         for (j = 0; j < segment; j++) {                                    \
1610             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1611         }                                                                  \
1612     }                                                                      \
1613     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1614 }
1615 
1616 #define nop(N, M, S) (M)
1617 
1618 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1619 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1620 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1621 
1622 #ifdef TARGET_AARCH64
1623 
1624 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1625 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1626 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1627 
1628 #endif
1629 
1630 #undef nop
1631 
1632 /*
1633  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1634  * the fused ops below they accumulate both from and into Vd.
1635  */
1636 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1637 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1638 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1639 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1640 
1641 #undef DO_FMUL_IDX
1642 
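     /*
      * For the fused by-element FMLA/FMLS helpers below, simd_data(desc)
      * packs a negate-op1 flag in bit 0 and the element index in the
      * remaining bits; negation is applied by XORing the sign bit into
      * each op1 element before the fused multiply-add.
      */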
1643 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1644 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1645                   void *stat, uint32_t desc)                               \
1646 {                                                                          \
1647     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1648     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1649     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1650     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1651     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1652     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1653     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1654         TYPE mm = m[H(i + idx)];                                           \
1655         for (j = 0; j < segment; j++) {                                    \
1656             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1657                                      mm, a[i + j], 0, stat);               \
1658         }                                                                  \
1659     }                                                                      \
1660     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1661 }
1662 
1663 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1664 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1665 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1666 
1667 #undef DO_FMLA_IDX
1668 
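     /*
      * Saturating arithmetic via a widened intermediate type: vq points
      * at the cumulative saturation (QC) flag, which is set whenever any
      * lane clamps to MIN or MAX.
      */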
1669 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1670 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1671 {                                                                          \
1672     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1673     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1674     bool q = false;                                                        \
1675     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1676         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1677         if (dd < MIN) {                                                    \
1678             dd = MIN;                                                      \
1679             q = true;                                                      \
1680         } else if (dd > MAX) {                                             \
1681             dd = MAX;                                                      \
1682             q = true;                                                      \
1683         }                                                                  \
1684         d[i] = dd;                                                         \
1685     }                                                                      \
1686     if (q) {                                                               \
1687         uint32_t *qc = vq;                                                 \
1688         qc[0] = 1;                                                         \
1689     }                                                                      \
1690     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1691 }
1692 
1693 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1694 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1695 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1696 
1697 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1698 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1699 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1700 
1701 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1702 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1703 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1704 
1705 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1706 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1707 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1708 
1709 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1710 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1711 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1712 
1713 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1714 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1715 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1716 
1717 #undef DO_SAT
1718 
1719 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1720                           void *vm, uint32_t desc)
1721 {
1722     intptr_t i, oprsz = simd_oprsz(desc);
1723     uint64_t *d = vd, *n = vn, *m = vm;
1724     bool q = false;
1725 
1726     for (i = 0; i < oprsz / 8; i++) {
1727         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1728         if (dd < nn) {
1729             dd = UINT64_MAX;
1730             q = true;
1731         }
1732         d[i] = dd;
1733     }
1734     if (q) {
1735         uint32_t *qc = vq;
1736         qc[0] = 1;
1737     }
1738     clear_tail(d, oprsz, simd_maxsz(desc));
1739 }
1740 
1741 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1742                           void *vm, uint32_t desc)
1743 {
1744     intptr_t i, oprsz = simd_oprsz(desc);
1745     uint64_t *d = vd, *n = vn, *m = vm;
1746     bool q = false;
1747 
1748     for (i = 0; i < oprsz / 8; i++) {
1749         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1750         if (nn < mm) {
1751             dd = 0;
1752             q = true;
1753         }
1754         d[i] = dd;
1755     }
1756     if (q) {
1757         uint32_t *qc = vq;
1758         qc[0] = 1;
1759     }
1760     clear_tail(d, oprsz, simd_maxsz(desc));
1761 }
1762 
1763 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1764                           void *vm, uint32_t desc)
1765 {
1766     intptr_t i, oprsz = simd_oprsz(desc);
1767     int64_t *d = vd, *n = vn, *m = vm;
1768     bool q = false;
1769 
1770     for (i = 0; i < oprsz / 8; i++) {
1771         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1772         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1773             dd = (nn >> 63) ^ ~INT64_MIN;
1774             q = true;
1775         }
1776         d[i] = dd;
1777     }
1778     if (q) {
1779         uint32_t *qc = vq;
1780         qc[0] = 1;
1781     }
1782     clear_tail(d, oprsz, simd_maxsz(desc));
1783 }
1784 
1785 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1786                           void *vm, uint32_t desc)
1787 {
1788     intptr_t i, oprsz = simd_oprsz(desc);
1789     int64_t *d = vd, *n = vn, *m = vm;
1790     bool q = false;
1791 
1792     for (i = 0; i < oprsz / 8; i++) {
1793         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1794         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1795             dd = (nn >> 63) ^ ~INT64_MIN;
1796             q = true;
1797         }
1798         d[i] = dd;
1799     }
1800     if (q) {
1801         uint32_t *qc = vq;
1802         qc[0] = 1;
1803     }
1804     clear_tail(d, oprsz, simd_maxsz(desc));
1805 }
1806 
1807 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1808                            void *vm, uint32_t desc)
1809 {
1810     intptr_t i, oprsz = simd_oprsz(desc);
1811     uint64_t *d = vd, *n = vn, *m = vm;
1812     bool q = false;
1813 
1814     for (i = 0; i < oprsz / 8; i++) {
1815         uint64_t nn = n[i];
1816         int64_t mm = m[i];
1817         uint64_t dd = nn + mm;
1818 
1819         if (mm < 0) {
1820             if (nn < (uint64_t)-mm) {
1821                 dd = 0;
1822                 q = true;
1823             }
1824         } else {
1825             if (dd < nn) {
1826                 dd = UINT64_MAX;
1827                 q = true;
1828             }
1829         }
1830         d[i] = dd;
1831     }
1832     if (q) {
1833         uint32_t *qc = vq;
1834         qc[0] = 1;
1835     }
1836     clear_tail(d, oprsz, simd_maxsz(desc));
1837 }
1838 
1839 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1840                            void *vm, uint32_t desc)
1841 {
1842     intptr_t i, oprsz = simd_oprsz(desc);
1843     uint64_t *d = vd, *n = vn, *m = vm;
1844     bool q = false;
1845 
1846     for (i = 0; i < oprsz / 8; i++) {
1847         int64_t nn = n[i];
1848         uint64_t mm = m[i];
1849         int64_t dd = nn + mm;
1850 
1851         if (mm > (uint64_t)(INT64_MAX - nn)) {
1852             dd = INT64_MAX;
1853             q = true;
1854         }
1855         d[i] = dd;
1856     }
1857     if (q) {
1858         uint32_t *qc = vq;
1859         qc[0] = 1;
1860     }
1861     clear_tail(d, oprsz, simd_maxsz(desc));
1862 }
1863 
1864 #define DO_SRA(NAME, TYPE)                              \
1865 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1866 {                                                       \
1867     intptr_t i, oprsz = simd_oprsz(desc);               \
1868     int shift = simd_data(desc);                        \
1869     TYPE *d = vd, *n = vn;                              \
1870     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1871         d[i] += n[i] >> shift;                          \
1872     }                                                   \
1873     clear_tail(d, oprsz, simd_maxsz(desc));             \
1874 }
1875 
1876 DO_SRA(gvec_ssra_b, int8_t)
1877 DO_SRA(gvec_ssra_h, int16_t)
1878 DO_SRA(gvec_ssra_s, int32_t)
1879 DO_SRA(gvec_ssra_d, int64_t)
1880 
1881 DO_SRA(gvec_usra_b, uint8_t)
1882 DO_SRA(gvec_usra_h, uint16_t)
1883 DO_SRA(gvec_usra_s, uint32_t)
1884 DO_SRA(gvec_usra_d, uint64_t)
1885 
1886 #undef DO_SRA
1887 
1888 #define DO_RSHR(NAME, TYPE)                             \
1889 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1890 {                                                       \
1891     intptr_t i, oprsz = simd_oprsz(desc);               \
1892     int shift = simd_data(desc);                        \
1893     TYPE *d = vd, *n = vn;                              \
1894     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1895         TYPE tmp = n[i] >> (shift - 1);                 \
1896         d[i] = (tmp >> 1) + (tmp & 1);                  \
1897     }                                                   \
1898     clear_tail(d, oprsz, simd_maxsz(desc));             \
1899 }
1900 
1901 DO_RSHR(gvec_srshr_b, int8_t)
1902 DO_RSHR(gvec_srshr_h, int16_t)
1903 DO_RSHR(gvec_srshr_s, int32_t)
1904 DO_RSHR(gvec_srshr_d, int64_t)
1905 
1906 DO_RSHR(gvec_urshr_b, uint8_t)
1907 DO_RSHR(gvec_urshr_h, uint16_t)
1908 DO_RSHR(gvec_urshr_s, uint32_t)
1909 DO_RSHR(gvec_urshr_d, uint64_t)
1910 
1911 #undef DO_RSHR
1912 
1913 #define DO_RSRA(NAME, TYPE)                             \
1914 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1915 {                                                       \
1916     intptr_t i, oprsz = simd_oprsz(desc);               \
1917     int shift = simd_data(desc);                        \
1918     TYPE *d = vd, *n = vn;                              \
1919     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1920         TYPE tmp = n[i] >> (shift - 1);                 \
1921         d[i] += (tmp >> 1) + (tmp & 1);                 \
1922     }                                                   \
1923     clear_tail(d, oprsz, simd_maxsz(desc));             \
1924 }
1925 
1926 DO_RSRA(gvec_srsra_b, int8_t)
1927 DO_RSRA(gvec_srsra_h, int16_t)
1928 DO_RSRA(gvec_srsra_s, int32_t)
1929 DO_RSRA(gvec_srsra_d, int64_t)
1930 
1931 DO_RSRA(gvec_ursra_b, uint8_t)
1932 DO_RSRA(gvec_ursra_h, uint16_t)
1933 DO_RSRA(gvec_ursra_s, uint32_t)
1934 DO_RSRA(gvec_ursra_d, uint64_t)
1935 
1936 #undef DO_RSRA
1937 
1938 #define DO_SRI(NAME, TYPE)                              \
1939 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1940 {                                                       \
1941     intptr_t i, oprsz = simd_oprsz(desc);               \
1942     int shift = simd_data(desc);                        \
1943     TYPE *d = vd, *n = vn;                              \
1944     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1945         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1946     }                                                   \
1947     clear_tail(d, oprsz, simd_maxsz(desc));             \
1948 }
1949 
1950 DO_SRI(gvec_sri_b, uint8_t)
1951 DO_SRI(gvec_sri_h, uint16_t)
1952 DO_SRI(gvec_sri_s, uint32_t)
1953 DO_SRI(gvec_sri_d, uint64_t)
1954 
1955 #undef DO_SRI
1956 
1957 #define DO_SLI(NAME, TYPE)                              \
1958 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1959 {                                                       \
1960     intptr_t i, oprsz = simd_oprsz(desc);               \
1961     int shift = simd_data(desc);                        \
1962     TYPE *d = vd, *n = vn;                              \
1963     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1964         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1965     }                                                   \
1966     clear_tail(d, oprsz, simd_maxsz(desc));             \
1967 }
1968 
1969 DO_SLI(gvec_sli_b, uint8_t)
1970 DO_SLI(gvec_sli_h, uint16_t)
1971 DO_SLI(gvec_sli_s, uint32_t)
1972 DO_SLI(gvec_sli_d, uint64_t)
1973 
1974 #undef DO_SLI
1975 
1976 /*
1977  * Convert float16 to float32, raising no exceptions and
1978  * preserving exceptional values, including SNaN.
1979  * This is effectively an unpack+repack operation.
1980  */
1981 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1982 {
1983     const int f16_bias = 15;
1984     const int f32_bias = 127;
1985     uint32_t sign = extract32(f16, 15, 1);
1986     uint32_t exp = extract32(f16, 10, 5);
1987     uint32_t frac = extract32(f16, 0, 10);
1988 
1989     if (exp == 0x1f) {
1990         /* Inf or NaN */
1991         exp = 0xff;
1992     } else if (exp == 0) {
1993         /* Zero or denormal.  */
1994         if (frac != 0) {
1995             if (fz16) {
1996                 frac = 0;
1997             } else {
1998                 /*
1999                  * Denormal; these are all normal float32.
2000                  * Shift the fraction so that the msb is at bit 10,
2001                  * then remove bit 10 as the implicit bit of the
2002                  * normalized float32.  Note that we still go through
2003                  * the shift for normal numbers below, to put the
2004                  * float32 fraction at the right place.
2005                  */
2006                 int shift = clz32(frac) - 21;
2007                 frac = (frac << shift) & 0x3ff;
2008                 exp = f32_bias - f16_bias - shift + 1;
2009             }
2010         }
2011     } else {
2012         /* Normal number; adjust the bias.  */
2013         exp += f32_bias - f16_bias;
2014     }
2015     sign <<= 31;
2016     exp <<= 23;
2017     frac <<= 23 - 10;
2018 
2019     return sign | exp | frac;
2020 }
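
     /*
      * Worked examples of the bit-level conversion above: f16 0x3c00 (1.0)
      * maps to f32 0x3f800000, and the f16 denormal 0x0001 (2^-24) becomes
      * the normal f32 0x33800000 via the clz-based renormalization.
      */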
2021 
2022 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2023 {
2024     /*
2025      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2026      * Load the 2nd qword iff is_q & is_2.
2027      * Shift to the 2nd dword iff !is_q & is_2.
2028      * For !is_q & !is_2, the upper bits of the result are garbage.
2029      */
2030     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2031 }
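
     /*
      * Spelled out, the four cases above are:
      *     is_q=0, is_2=0: ptr[0]        (u32[0] in the low half, rest garbage)
      *     is_q=0, is_2=1: ptr[0] >> 32  (u32[1])
      *     is_q=1, is_2=0: ptr[0]        (u64[0])
      *     is_q=1, is_2=1: ptr[1]        (u64[1])
      */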
2032 
2033 /*
2034  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2035  * as there are not yet SVE versions that might use blocking.
2036  */
2037 
2038 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2039                      uint32_t desc, bool fz16)
2040 {
2041     intptr_t i, oprsz = simd_oprsz(desc);
2042     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2043     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2044     int is_q = oprsz == 16;
2045     uint64_t n_4, m_4;
2046 
2047     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2048     n_4 = load4_f16(vn, is_q, is_2);
2049     m_4 = load4_f16(vm, is_q, is_2);
2050 
2051     /* Negate all inputs for FMLSL at once.  */
2052     if (is_s) {
2053         n_4 ^= 0x8000800080008000ull;
2054     }
2055 
2056     for (i = 0; i < oprsz / 4; i++) {
2057         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2058         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2059         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2060     }
2061     clear_tail(d, oprsz, simd_maxsz(desc));
2062 }
2063 
2064 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2065                             void *venv, uint32_t desc)
2066 {
2067     CPUARMState *env = venv;
2068     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2069              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2070 }
2071 
2072 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2073                             void *venv, uint32_t desc)
2074 {
2075     CPUARMState *env = venv;
2076     do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
2077              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2078 }
2079 
2080 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2081                                void *venv, uint32_t desc)
2082 {
2083     intptr_t i, oprsz = simd_oprsz(desc);
2084     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2085     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2086     CPUARMState *env = venv;
2087     float_status *status = &env->vfp.fp_status;
2088     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2089 
2090     for (i = 0; i < oprsz; i += sizeof(float32)) {
2091         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2092         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2093         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2094         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2095         float32 aa = *(float32 *)(va + H1_4(i));
2096 
2097         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2098     }
2099 }
2100 
2101 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2102                          uint32_t desc, bool fz16)
2103 {
2104     intptr_t i, oprsz = simd_oprsz(desc);
2105     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2106     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2107     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2108     int is_q = oprsz == 16;
2109     uint64_t n_4;
2110     float32 m_1;
2111 
2112     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2113     n_4 = load4_f16(vn, is_q, is_2);
2114 
2115     /* Negate all inputs for FMLSL at once.  */
2116     if (is_s) {
2117         n_4 ^= 0x8000800080008000ull;
2118     }
2119 
2120     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2121 
2122     for (i = 0; i < oprsz / 4; i++) {
2123         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2124         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2125     }
2126     clear_tail(d, oprsz, simd_maxsz(desc));
2127 }
2128 
2129 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2130                                 void *venv, uint32_t desc)
2131 {
2132     CPUARMState *env = venv;
2133     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2134                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2135 }
2136 
2137 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2138                                 void *venv, uint32_t desc)
2139 {
2140     CPUARMState *env = venv;
2141     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
2142                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2143 }
2144 
2145 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2146                                void *venv, uint32_t desc)
2147 {
2148     intptr_t i, j, oprsz = simd_oprsz(desc);
2149     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2150     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2151     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2152     CPUARMState *env = venv;
2153     float_status *status = &env->vfp.fp_status;
2154     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2155 
2156     for (i = 0; i < oprsz; i += 16) {
2157         float16 mm_16 = *(float16 *)(vm + i + idx);
2158         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2159 
2160         for (j = 0; j < 16; j += sizeof(float32)) {
2161             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2162             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2163             float32 aa = *(float32 *)(va + H1_4(i + j));
2164 
2165             *(float32 *)(vd + H1_4(i + j)) =
2166                 float32_muladd(nn, mm, aa, 0, status);
2167         }
2168     }
2169 }
2170 
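     /*
      * Shifts by a signed, per-lane count (Neon VSHL by register): a
      * non-negative count shifts left, a negative count shifts right by
      * its magnitude, and counts outside the element width produce zero
      * (or, for the signed right shifts, a copy of the sign bit).
      */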
2171 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2172 {
2173     intptr_t i, opr_sz = simd_oprsz(desc);
2174     int8_t *d = vd, *n = vn, *m = vm;
2175 
2176     for (i = 0; i < opr_sz; ++i) {
2177         int8_t mm = m[i];
2178         int8_t nn = n[i];
2179         int8_t res = 0;
2180         if (mm >= 0) {
2181             if (mm < 8) {
2182                 res = nn << mm;
2183             }
2184         } else {
2185             res = nn >> (mm > -8 ? -mm : 7);
2186         }
2187         d[i] = res;
2188     }
2189     clear_tail(d, opr_sz, simd_maxsz(desc));
2190 }
2191 
2192 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2193 {
2194     intptr_t i, opr_sz = simd_oprsz(desc);
2195     int16_t *d = vd, *n = vn, *m = vm;
2196 
2197     for (i = 0; i < opr_sz / 2; ++i) {
2198         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2199         int16_t nn = n[i];
2200         int16_t res = 0;
2201         if (mm >= 0) {
2202             if (mm < 16) {
2203                 res = nn << mm;
2204             }
2205         } else {
2206             res = nn >> (mm > -16 ? -mm : 15);
2207         }
2208         d[i] = res;
2209     }
2210     clear_tail(d, opr_sz, simd_maxsz(desc));
2211 }
2212 
2213 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2214 {
2215     intptr_t i, opr_sz = simd_oprsz(desc);
2216     uint8_t *d = vd, *n = vn, *m = vm;
2217 
2218     for (i = 0; i < opr_sz; ++i) {
2219         int8_t mm = m[i];
2220         uint8_t nn = n[i];
2221         uint8_t res = 0;
2222         if (mm >= 0) {
2223             if (mm < 8) {
2224                 res = nn << mm;
2225             }
2226         } else {
2227             if (mm > -8) {
2228                 res = nn >> -mm;
2229             }
2230         }
2231         d[i] = res;
2232     }
2233     clear_tail(d, opr_sz, simd_maxsz(desc));
2234 }
2235 
2236 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2237 {
2238     intptr_t i, opr_sz = simd_oprsz(desc);
2239     uint16_t *d = vd, *n = vn, *m = vm;
2240 
2241     for (i = 0; i < opr_sz / 2; ++i) {
2242         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2243         uint16_t nn = n[i];
2244         uint16_t res = 0;
2245         if (mm >= 0) {
2246             if (mm < 16) {
2247                 res = nn << mm;
2248             }
2249         } else {
2250             if (mm > -16) {
2251                 res = nn >> -mm;
2252             }
2253         }
2254         d[i] = res;
2255     }
2256     clear_tail(d, opr_sz, simd_maxsz(desc));
2257 }
2258 
2259 /*
2260  * 8x8->8 polynomial multiply.
2261  *
2262  * Polynomial multiplication is like integer multiplication except the
2263  * partial products are XORed, not added.
2264  *
2265  * TODO: expose this as a generic vector operation, as it is a common
2266  * crypto building block.
2267  */
2268 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2269 {
2270     intptr_t i, opr_sz = simd_oprsz(desc);
2271     uint64_t *d = vd, *n = vn, *m = vm;
2272 
2273     for (i = 0; i < opr_sz / 8; ++i) {
2274         d[i] = clmul_8x8_low(n[i], m[i]);
2275     }
2276     clear_tail(d, opr_sz, simd_maxsz(desc));
2277 }
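
     /*
      * Illustrative scalar sketch of one 8x8->8 lane of the carry-less
      * multiply above (clmul_8x8_low applies this to all eight byte lanes
      * of a 64-bit word at once; r, n, m are hypothetical scalars):
      *
      *     uint8_t r = 0;
      *     for (int b = 0; b < 8; b++) {
      *         if (m & (1u << b)) {
      *             r ^= n << b;        // partial products are XORed
      *         }
      *     }
      *
      * e.g. 0x03 * 0x05 == 0x0f, since (x + 1)(x^2 + 1) = x^3 + x^2 + x + 1.
      */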
2278 
2279 /*
2280  * 64x64->128 polynomial multiply.
2281  * Because the lanes are not accessed in strict columns,
2282  * this probably cannot be turned into a generic helper.
2283  */
2284 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2285 {
2286     intptr_t i, opr_sz = simd_oprsz(desc);
2287     intptr_t hi = simd_data(desc);
2288     uint64_t *d = vd, *n = vn, *m = vm;
2289 
2290     for (i = 0; i < opr_sz / 8; i += 2) {
2291         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2292         d[i] = int128_getlo(r);
2293         d[i + 1] = int128_gethi(r);
2294     }
2295     clear_tail(d, opr_sz, simd_maxsz(desc));
2296 }
2297 
2298 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2299 {
2300     int hi = simd_data(desc);
2301     uint64_t *d = vd, *n = vn, *m = vm;
2302     uint64_t nn = n[hi], mm = m[hi];
2303 
2304     d[0] = clmul_8x4_packed(nn, mm);
2305     nn >>= 32;
2306     mm >>= 32;
2307     d[1] = clmul_8x4_packed(nn, mm);
2308 
2309     clear_tail(d, 16, simd_maxsz(desc));
2310 }
2311 
2312 #ifdef TARGET_AARCH64
2313 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2314 {
2315     int shift = simd_data(desc) * 8;
2316     intptr_t i, opr_sz = simd_oprsz(desc);
2317     uint64_t *d = vd, *n = vn, *m = vm;
2318 
2319     for (i = 0; i < opr_sz / 8; ++i) {
2320         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2321     }
2322 }
2323 
2324 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2325 {
2326     intptr_t sel = H4(simd_data(desc));
2327     intptr_t i, opr_sz = simd_oprsz(desc);
2328     uint32_t *n = vn, *m = vm;
2329     uint64_t *d = vd;
2330 
2331     for (i = 0; i < opr_sz / 8; ++i) {
2332         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2333     }
2334 }
2335 #endif
2336 
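     /*
      * Compare against zero: -(nn OP 0) turns the 0/1 comparison result
      * into the all-zeros or all-ones element mask that the AdvSIMD
      * compare-against-zero instructions produce.
      */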
2337 #define DO_CMP0(NAME, TYPE, OP)                         \
2338 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2339 {                                                       \
2340     intptr_t i, opr_sz = simd_oprsz(desc);              \
2341     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2342         TYPE nn = *(TYPE *)(vn + i);                    \
2343         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2344     }                                                   \
2345     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2346 }
2347 
2348 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2349 DO_CMP0(gvec_clt0_b, int8_t, <)
2350 DO_CMP0(gvec_cle0_b, int8_t, <=)
2351 DO_CMP0(gvec_cgt0_b, int8_t, >)
2352 DO_CMP0(gvec_cge0_b, int8_t, >=)
2353 
2354 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2355 DO_CMP0(gvec_clt0_h, int16_t, <)
2356 DO_CMP0(gvec_cle0_h, int16_t, <=)
2357 DO_CMP0(gvec_cgt0_h, int16_t, >)
2358 DO_CMP0(gvec_cge0_h, int16_t, >=)
2359 
2360 #undef DO_CMP0
2361 
2362 #define DO_ABD(NAME, TYPE)                                      \
2363 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2364 {                                                               \
2365     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2366     TYPE *d = vd, *n = vn, *m = vm;                             \
2367                                                                 \
2368     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2369         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2370     }                                                           \
2371     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2372 }
2373 
2374 DO_ABD(gvec_sabd_b, int8_t)
2375 DO_ABD(gvec_sabd_h, int16_t)
2376 DO_ABD(gvec_sabd_s, int32_t)
2377 DO_ABD(gvec_sabd_d, int64_t)
2378 
2379 DO_ABD(gvec_uabd_b, uint8_t)
2380 DO_ABD(gvec_uabd_h, uint16_t)
2381 DO_ABD(gvec_uabd_s, uint32_t)
2382 DO_ABD(gvec_uabd_d, uint64_t)
2383 
2384 #undef DO_ABD
2385 
2386 #define DO_ABA(NAME, TYPE)                                      \
2387 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2388 {                                                               \
2389     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2390     TYPE *d = vd, *n = vn, *m = vm;                             \
2391                                                                 \
2392     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2393         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2394     }                                                           \
2395     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2396 }
2397 
2398 DO_ABA(gvec_saba_b, int8_t)
2399 DO_ABA(gvec_saba_h, int16_t)
2400 DO_ABA(gvec_saba_s, int32_t)
2401 DO_ABA(gvec_saba_d, int64_t)
2402 
2403 DO_ABA(gvec_uaba_b, uint8_t)
2404 DO_ABA(gvec_uaba_h, uint16_t)
2405 DO_ABA(gvec_uaba_s, uint32_t)
2406 DO_ABA(gvec_uaba_d, uint64_t)
2407 
2408 #undef DO_ABA
2409 
2410 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2411 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
2412 {                                                                          \
2413     ARMVectorReg scratch;                                                  \
2414     intptr_t oprsz = simd_oprsz(desc);                                     \
2415     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2416     TYPE *d = vd, *n = vn, *m = vm;                                        \
2417     if (unlikely(d == m)) {                                                \
2418         m = memcpy(&scratch, m, oprsz);                                    \
2419     }                                                                      \
2420     for (intptr_t i = 0; i < half; ++i) {                                  \
2421         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2422     }                                                                      \
2423     for (intptr_t i = 0; i < half; ++i) {                                  \
2424         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2425     }                                                                      \
2426     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2427 }
2428 
2429 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2430 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2431 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2432 
2433 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2434 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2435 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2436 
2437 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2438 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2439 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2440 
2441 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2442 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2443 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2444 
2445 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2446 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2447 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2448 
2449 #undef DO_3OP_PAIR
2450 
2451 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2452 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2453 {                                                               \
2454     ARMVectorReg scratch;                                       \
2455     intptr_t oprsz = simd_oprsz(desc);                          \
2456     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2457     TYPE *d = vd, *n = vn, *m = vm;                             \
2458     if (unlikely(d == m)) {                                     \
2459         m = memcpy(&scratch, m, oprsz);                         \
2460     }                                                           \
2461     for (intptr_t i = 0; i < half; ++i) {                       \
2462         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2463     }                                                           \
2464     for (intptr_t i = 0; i < half; ++i) {                       \
2465         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2466     }                                                           \
2467     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2468 }
2469 
2470 #define ADD(A, B) (A + B)
2471 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2472 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2473 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2474 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2475 #undef  ADD
2476 
2477 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2478 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2479 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2480 
2481 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2482 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2483 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2484 
2485 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2486 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2487 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2488 
2489 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2490 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2491 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2492 
2493 #undef DO_3OP_PAIR
2494 
2495 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2496     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2497     {                                                                   \
2498         intptr_t i, oprsz = simd_oprsz(desc);                           \
2499         int shift = simd_data(desc);                                    \
2500         TYPE *d = vd, *n = vn;                                          \
2501         float_status *fpst = stat;                                      \
2502         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2503             d[i] = FUNC(n[i], shift, fpst);                             \
2504         }                                                               \
2505         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2506     }
2507 
2508 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2509 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2510 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2511 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2512 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2513 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2514 
2515 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2516 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2517 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2518 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2519 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2520 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2521 
2522 #undef DO_VCVT_FIXED
2523 
2524 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2525     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2526     {                                                                   \
2527         float_status *fpst = stat;                                      \
2528         intptr_t i, oprsz = simd_oprsz(desc);                           \
2529         uint32_t rmode = simd_data(desc);                               \
2530         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2531         TYPE *d = vd, *n = vn;                                          \
2532         set_float_rounding_mode(rmode, fpst);                           \
2533         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2534             d[i] = FUNC(n[i], 0, fpst);                                 \
2535         }                                                               \
2536         set_float_rounding_mode(prev_rmode, fpst);                      \
2537         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2538     }
2539 
2540 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2541 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2542 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2543 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2544 
2545 #undef DO_VCVT_RMODE
2546 
2547 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2548     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2549     {                                                                   \
2550         float_status *fpst = stat;                                      \
2551         intptr_t i, oprsz = simd_oprsz(desc);                           \
2552         uint32_t rmode = simd_data(desc);                               \
2553         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2554         TYPE *d = vd, *n = vn;                                          \
2555         set_float_rounding_mode(rmode, fpst);                           \
2556         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2557             d[i] = FUNC(n[i], fpst);                                    \
2558         }                                                               \
2559         set_float_rounding_mode(prev_rmode, fpst);                      \
2560         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2561     }
2562 
2563 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2564 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2565 
2566 #undef DO_VRINT_RMODE
2567 
2568 #ifdef TARGET_AARCH64
2569 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2570 {
2571     const uint8_t *indices = vm;
2572     CPUARMState *env = venv;
2573     size_t oprsz = simd_oprsz(desc);
2574     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2575     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2576     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2577     union {
2578         uint8_t b[16];
2579         uint64_t d[2];
2580     } result;
2581 
2582     /*
2583      * We must construct the final result in a temp, lest the output
2584      * overlaps the input table.  For TBL, begin with zero; for TBX,
2585      * begin with the original register contents.  Note that we always
2586      * copy 16 bytes here to avoid an extra branch; clearing the high
2587      * bits of the register for oprsz == 8 is handled below.
2588      */
2589     if (is_tbx) {
2590         memcpy(&result, vd, 16);
2591     } else {
2592         memset(&result, 0, 16);
2593     }
2594 
2595     for (size_t i = 0; i < oprsz; ++i) {
2596         uint32_t index = indices[H1(i)];
2597 
2598         if (index < table_len) {
2599             /*
2600              * Convert index (a byte offset into the virtual table
2601              * which is a series of 128-bit vectors concatenated)
2602              * into the correct register element, bearing in mind
2603              * that the table can wrap around from V31 to V0.
2604              */
2605             const uint8_t *table = (const uint8_t *)
2606                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2607             result.b[H1(i)] = table[H1(index % 16)];
2608         }
2609     }
2610 
2611     memcpy(vd, &result, 16);
2612     clear_tail(vd, oprsz, simd_maxsz(desc));
2613 }
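
     /*
      * For example (hypothetical operands): with rn == 30 and a
      * four-register table (table_len == 64), index 40 selects register
      * (30 + 40 / 16) % 32 == 0, byte 40 % 16 == 8, i.e. the lookup
      * wraps from V31 back to V0.
      */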
2614 #endif
2615 
2616 /*
2617  * NxN -> N highpart multiply
2618  *
2619  * TODO: expose this as a generic vector operation.
2620  */
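
     /*
      * For example, the unsigned byte form computes ((uint32_t)n * m) >> 8,
      * so 0xff * 0xff == 0xfe01 yields 0xfe; the 64-bit forms use
      * mulu64/muls64 and keep only the high half of the 128-bit product.
      */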
2621 
2622 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2623 {
2624     intptr_t i, opr_sz = simd_oprsz(desc);
2625     int8_t *d = vd, *n = vn, *m = vm;
2626 
2627     for (i = 0; i < opr_sz; ++i) {
2628         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2629     }
2630     clear_tail(d, opr_sz, simd_maxsz(desc));
2631 }
2632 
2633 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2634 {
2635     intptr_t i, opr_sz = simd_oprsz(desc);
2636     int16_t *d = vd, *n = vn, *m = vm;
2637 
2638     for (i = 0; i < opr_sz / 2; ++i) {
2639         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2640     }
2641     clear_tail(d, opr_sz, simd_maxsz(desc));
2642 }
2643 
2644 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2645 {
2646     intptr_t i, opr_sz = simd_oprsz(desc);
2647     int32_t *d = vd, *n = vn, *m = vm;
2648 
2649     for (i = 0; i < opr_sz / 4; ++i) {
2650         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2651     }
2652     clear_tail(d, opr_sz, simd_maxsz(desc));
2653 }
2654 
2655 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2656 {
2657     intptr_t i, opr_sz = simd_oprsz(desc);
2658     uint64_t *d = vd, *n = vn, *m = vm;
2659     uint64_t discard;
2660 
2661     for (i = 0; i < opr_sz / 8; ++i) {
2662         muls64(&discard, &d[i], n[i], m[i]);
2663     }
2664     clear_tail(d, opr_sz, simd_maxsz(desc));
2665 }
2666 
2667 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2668 {
2669     intptr_t i, opr_sz = simd_oprsz(desc);
2670     uint8_t *d = vd, *n = vn, *m = vm;
2671 
2672     for (i = 0; i < opr_sz; ++i) {
2673         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2674     }
2675     clear_tail(d, opr_sz, simd_maxsz(desc));
2676 }
2677 
2678 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2679 {
2680     intptr_t i, opr_sz = simd_oprsz(desc);
2681     uint16_t *d = vd, *n = vn, *m = vm;
2682 
2683     for (i = 0; i < opr_sz / 2; ++i) {
2684         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2685     }
2686     clear_tail(d, opr_sz, simd_maxsz(desc));
2687 }
2688 
2689 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2690 {
2691     intptr_t i, opr_sz = simd_oprsz(desc);
2692     uint32_t *d = vd, *n = vn, *m = vm;
2693 
2694     for (i = 0; i < opr_sz / 4; ++i) {
2695         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2696     }
2697     clear_tail(d, opr_sz, simd_maxsz(desc));
2698 }
2699 
2700 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2701 {
2702     intptr_t i, opr_sz = simd_oprsz(desc);
2703     uint64_t *d = vd, *n = vn, *m = vm;
2704     uint64_t discard;
2705 
2706     for (i = 0; i < opr_sz / 8; ++i) {
2707         mulu64(&discard, &d[i], n[i], m[i]);
2708     }
2709     clear_tail(d, opr_sz, simd_maxsz(desc));
2710 }
2711 
2712 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2713 {
2714     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2715     int shr = simd_data(desc);
2716     uint64_t *d = vd, *n = vn, *m = vm;
2717 
2718     for (i = 0; i < opr_sz; ++i) {
2719         d[i] = ror64(n[i] ^ m[i], shr);
2720     }
2721     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2722 }
2723 
2724 /*
2725  * Integer matrix-multiply accumulate
2726  */
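
     /*
      * Per 128-bit segment, n and m are treated as 2x8 byte matrices and
      * each of the four 32-bit accumulators adds the dot product of one
      * (row of n, row of m) pair to the corresponding element of va,
      * i.e. a 2x2 block of n multiplied by the transpose of m.
      */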
2727 
2728 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2729 {
2730     int8_t *n = vn, *m = vm;
2731 
2732     for (intptr_t k = 0; k < 8; ++k) {
2733         sum += n[H1(k)] * m[H1(k)];
2734     }
2735     return sum;
2736 }
2737 
2738 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2739 {
2740     uint8_t *n = vn, *m = vm;
2741 
2742     for (intptr_t k = 0; k < 8; ++k) {
2743         sum += n[H1(k)] * m[H1(k)];
2744     }
2745     return sum;
2746 }
2747 
2748 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2749 {
2750     uint8_t *n = vn;
2751     int8_t *m = vm;
2752 
2753     for (intptr_t k = 0; k < 8; ++k) {
2754         sum += n[H1(k)] * m[H1(k)];
2755     }
2756     return sum;
2757 }
2758 
2759 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2760                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2761 {
2762     intptr_t seg, opr_sz = simd_oprsz(desc);
2763 
2764     for (seg = 0; seg < opr_sz; seg += 16) {
2765         uint32_t *d = vd + seg;
2766         uint32_t *a = va + seg;
2767         uint32_t sum0, sum1, sum2, sum3;
2768 
2769         /*
2770          * Process the entire segment at once, writing back the
2771          * results only after we've consumed all of the inputs.
2772          *
2773          * Key to indices by column:
2774          *          i   j                  i             j
2775          */
2776         sum0 = a[H4(0 + 0)];
2777         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2778         sum1 = a[H4(0 + 1)];
2779         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2780         sum2 = a[H4(2 + 0)];
2781         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2782         sum3 = a[H4(2 + 1)];
2783         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2784 
2785         d[H4(0)] = sum0;
2786         d[H4(1)] = sum1;
2787         d[H4(2)] = sum2;
2788         d[H4(3)] = sum3;
2789     }
2790     clear_tail(vd, opr_sz, simd_maxsz(desc));
2791 }
2792 
2793 #define DO_MMLA_B(NAME, INNER) \
2794     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2795     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2796 
2797 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2798 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2799 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2800 
2801 /*
2802  * BFloat16 Dot Product
2803  */
2804 
2805 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2806 {
2807     /*
2808      * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2809      * For EBF = 0, we ignore the FPCR bits which determine rounding
2810      * mode and denormal-flushing, and we do unfused multiplies and
2811      * additions with intermediate rounding of all products and sums.
2812      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2813      * and we perform a fused two-way sum-of-products without intermediate
2814      * rounding of the products.
2815      * In either case, we don't set fp exception flags.
2816      *
2817      * EBF is AArch64 only, so even if it's set in the FPCR it has
2818      * no effect on AArch32 instructions.
2819      */
2820     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2821 
2822     *statusp = env->vfp.fp_status;
2823     set_default_nan_mode(true, statusp);
2824 
2825     if (ebf) {
2826         /* EBF=1 needs to do a step with round-to-odd semantics */
2827         *oddstatusp = *statusp;
2828         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2829     } else {
2830         set_flush_to_zero(true, statusp);
2831         set_flush_inputs_to_zero(true, statusp);
2832         set_float_rounding_mode(float_round_to_odd_inf, statusp);
2833     }
2834     return ebf;
2835 }
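
     /*
      * Typical usage, as in gvec_bfdot below (a sketch, not a separate
      * API; sum, e1 and e2 stand for the caller's own values):
      *
      *     float_status fpst, fpst_odd;
      *     if (is_ebf(env, &fpst, &fpst_odd)) {
      *         sum = bfdotadd_ebf(sum, e1, e2, &fpst, &fpst_odd);
      *     } else {
      *         sum = bfdotadd(sum, e1, e2, &fpst);
      *     }
      */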
2836 
2837 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2838 {
2839     float32 t1, t2;
2840 
2841     /*
2842      * Extract each BFloat16 from the element pair, and shift
2843      * them such that they become float32.
2844      */
2845     t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2846     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2847     t1 = float32_add(t1, t2, fpst);
2848     t1 = float32_add(sum, t1, fpst);
2849 
2850     return t1;
2851 }
2852 
2853 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
2854                      float_status *fpst, float_status *fpst_odd)
2855 {
2856     /*
2857      * Compare f16_dotadd() in sme_helper.c, but here we have
2858      * bfloat16 inputs. In particular that means that we do not
2859      * want the FPCR.FZ16 flush semantics, so we use the normal
2860      * float_status for the input handling here.
2861      */
2862     float64 e1r = float32_to_float64(e1 << 16, fpst);
2863     float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
2864     float64 e2r = float32_to_float64(e2 << 16, fpst);
2865     float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
2866     float64 t64;
2867     float32 t32;
2868 
2869     /*
2870      * The ARM pseudocode function FPDot performs both multiplies
2871      * and the add with a single rounding operation.  Emulate this
2872      * by performing the first multiply in round-to-odd, then doing
2873      * the second multiply as fused multiply-add, and rounding to
2874      * float32 all in one step.
2875      */
2876     t64 = float64_mul(e1r, e2r, fpst_odd);
2877     t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
2878 
2879     /* This conversion is exact: we already rounded to float32 precision. */
2880     t32 = float64_to_float32(t64, fpst);
2881 
2882     /* The final accumulation step is not fused. */
2883     return float32_add(sum, t32, fpst);
2884 }
2885 
2886 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
2887                         CPUARMState *env, uint32_t desc)
2888 {
2889     intptr_t i, opr_sz = simd_oprsz(desc);
2890     float32 *d = vd, *a = va;
2891     uint32_t *n = vn, *m = vm;
2892     float_status fpst, fpst_odd;
2893 
2894     if (is_ebf(env, &fpst, &fpst_odd)) {
2895         for (i = 0; i < opr_sz / 4; ++i) {
2896             d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
2897         }
2898     } else {
2899         for (i = 0; i < opr_sz / 4; ++i) {
2900             d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
2901         }
2902     }
2903     clear_tail(d, opr_sz, simd_maxsz(desc));
2904 }
2905 
2906 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2907                             void *va, CPUARMState *env, uint32_t desc)
2908 {
2909     intptr_t i, j, opr_sz = simd_oprsz(desc);
2910     intptr_t index = simd_data(desc);
2911     intptr_t elements = opr_sz / 4;
2912     intptr_t eltspersegment = MIN(16 / 4, elements);
2913     float32 *d = vd, *a = va;
2914     uint32_t *n = vn, *m = vm;
2915     float_status fpst, fpst_odd;
2916 
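    /*
     * Within each 128-bit segment, every element uses the same indexed
     * 32-bit bfloat16 pair from m.
     */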
2917     if (is_ebf(env, &fpst, &fpst_odd)) {
2918         for (i = 0; i < elements; i += eltspersegment) {
2919             uint32_t m_idx = m[i + H4(index)];
2920 
2921             for (j = i; j < i + eltspersegment; j++) {
2922                 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
2923             }
2924         }
2925     } else {
2926         for (i = 0; i < elements; i += eltspersegment) {
2927             uint32_t m_idx = m[i + H4(index)];
2928 
2929             for (j = i; j < i + eltspersegment; j++) {
2930                 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
2931             }
2932         }
2933     }
2934     clear_tail(d, opr_sz, simd_maxsz(desc));
2935 }
2936 
2937 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
2938                          CPUARMState *env, uint32_t desc)
2939 {
2940     intptr_t s, opr_sz = simd_oprsz(desc);
2941     float32 *d = vd, *a = va;
2942     uint32_t *n = vn, *m = vm;
2943     float_status fpst, fpst_odd;
2944 
2945     if (is_ebf(env, &fpst, &fpst_odd)) {
2946         for (s = 0; s < opr_sz / 4; s += 4) {
2947             float32 sum00, sum01, sum10, sum11;
2948 
2949             /*
2950              * Process the entire segment at once, writing back the
2951              * results only after we've consumed all of the inputs.
2952              *
2953              * Key to indices by column:
2954              *               i   j               i   k             j   k
2955              */
2956             sum00 = a[s + H4(0 + 0)];
2957             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2958             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2959 
2960             sum01 = a[s + H4(0 + 1)];
2961             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2962             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2963 
2964             sum10 = a[s + H4(2 + 0)];
2965             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2966             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2967 
2968             sum11 = a[s + H4(2 + 1)];
2969             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2970             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2971 
2972             d[s + H4(0 + 0)] = sum00;
2973             d[s + H4(0 + 1)] = sum01;
2974             d[s + H4(2 + 0)] = sum10;
2975             d[s + H4(2 + 1)] = sum11;
2976         }
2977     } else {
2978         for (s = 0; s < opr_sz / 4; s += 4) {
2979             float32 sum00, sum01, sum10, sum11;
2980 
2981             /*
2982              * Process the entire segment at once, writing back the
2983              * results only after we've consumed all of the inputs.
2984              *
2985              * Key to indices by column:
2986              *               i   j           i   k             j   k
2987              */
2988             sum00 = a[s + H4(0 + 0)];
2989             sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
2990             sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
2991 
2992             sum01 = a[s + H4(0 + 1)];
2993             sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
2994             sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
2995 
2996             sum10 = a[s + H4(2 + 0)];
2997             sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
2998             sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
2999 
3000             sum11 = a[s + H4(2 + 1)];
3001             sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
3002             sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
3003 
3004             d[s + H4(0 + 0)] = sum00;
3005             d[s + H4(0 + 1)] = sum01;
3006             d[s + H4(2 + 0)] = sum10;
3007             d[s + H4(2 + 1)] = sum11;
3008         }
3009     }
3010     clear_tail(d, opr_sz, simd_maxsz(desc));
3011 }
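
/*
 * Illustrative reference for one 128-bit BFMMLA segment (a sketch for
 * clarity only: the function name is invented and the H4() host-order
 * adjustment is ignored).  Accumulator (i, j) is the two-pair bfloat16
 * dot product of n row i with m row j, which is what the unrolled
 * non-EBF code above computes; the EBF=1 path has the same shape but
 * uses bfdotadd_ebf() and the round-to-odd status.
 */
static inline void bfmmla_segment_ref(float32 d[2][2], const float32 a[2][2],
                                      const uint32_t n[2][2],
                                      const uint32_t m[2][2],
                                      float_status *fpst)
{
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            float32 sum = a[i][j];

            for (int k = 0; k < 2; k++) {
                sum = bfdotadd(sum, n[i][k], m[j][k], fpst);
            }
            d[i][j] = sum;
        }
    }
}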
3012 
3013 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
3014                          void *stat, uint32_t desc)
3015 {
3016     intptr_t i, opr_sz = simd_oprsz(desc);
3017     intptr_t sel = simd_data(desc);
3018     float32 *d = vd, *a = va;
3019     bfloat16 *n = vn, *m = vm;
3020 
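    /* sel picks the even (0) or odd (1) bfloat16 of each 32-bit pair. */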
3021     for (i = 0; i < opr_sz / 4; ++i) {
3022         float32 nn = n[H2(i * 2 + sel)] << 16;
3023         float32 mm = m[H2(i * 2 + sel)] << 16;
3024         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
3025     }
3026     clear_tail(d, opr_sz, simd_maxsz(desc));
3027 }
3028 
3029 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
3030                              void *va, void *stat, uint32_t desc)
3031 {
3032     intptr_t i, j, opr_sz = simd_oprsz(desc);
3033     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3034     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3035     intptr_t elements = opr_sz / 4;
3036     intptr_t eltspersegment = MIN(16 / 4, elements);
3037     float32 *d = vd, *a = va;
3038     bfloat16 *n = vn, *m = vm;
3039 
3040     for (i = 0; i < elements; i += eltspersegment) {
3041         float32 m_idx = m[H2(2 * i + index)] << 16;
3042 
3043         for (j = i; j < i + eltspersegment; j++) {
3044             float32 n_j = n[H2(2 * j + sel)] << 16;
3045             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
3046         }
3047     }
3048     clear_tail(d, opr_sz, simd_maxsz(desc));
3049 }
3050 
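/* Clamp each element of a into the inclusive range [n, m]. */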
3051 #define DO_CLAMP(NAME, TYPE) \
3052 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
3053 {                                                                       \
3054     intptr_t i, opr_sz = simd_oprsz(desc);                              \
3055     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
3056         TYPE aa = *(TYPE *)(a + i);                                     \
3057         TYPE nn = *(TYPE *)(n + i);                                     \
3058         TYPE mm = *(TYPE *)(m + i);                                     \
3059         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
3060         *(TYPE *)(d + i) = dd;                                          \
3061     }                                                                   \
3062     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
3063 }
3064 
3065 DO_CLAMP(gvec_sclamp_b, int8_t)
3066 DO_CLAMP(gvec_sclamp_h, int16_t)
3067 DO_CLAMP(gvec_sclamp_s, int32_t)
3068 DO_CLAMP(gvec_sclamp_d, int64_t)
3069 
3070 DO_CLAMP(gvec_uclamp_b, uint8_t)
3071 DO_CLAMP(gvec_uclamp_h, uint16_t)
3072 DO_CLAMP(gvec_uclamp_s, uint32_t)
3073 DO_CLAMP(gvec_uclamp_d, uint64_t)
3074 
3075 /* Bit count in each 8-bit word. */
3076 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
3077 {
3078     intptr_t i, opr_sz = simd_oprsz(desc);
3079     uint8_t *d = vd, *n = vn;
3080 
3081     for (i = 0; i < opr_sz; ++i) {
3082         d[i] = ctpop8(n[i]);
3083     }
3084     clear_tail(d, opr_sz, simd_maxsz(desc));
3085 }
3086 
3087 /* Reverse bits in each 8-bit word. */
3088 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
3089 {
3090     intptr_t i, opr_sz = simd_oprsz(desc);
3091     uint64_t *d = vd, *n = vn;
3092 
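    /*
     * bswap64 pre-swaps the bytes so that the full 64-bit bit reversal
     * leaves each byte in its original position with its bits reversed.
     */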
3093     for (i = 0; i < opr_sz / 8; ++i) {
3094         d[i] = revbit64(bswap64(n[i]));
3095     }
3096     clear_tail(d, opr_sz, simd_maxsz(desc));
3097 }
3098