1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28 
29 /*
30  * Data for expanding active predicate bits to bytes, for byte elements.
31  *
32  *  for (i = 0; i < 256; ++i) {
33  *      unsigned long m = 0;
34  *      for (j = 0; j < 8; j++) {
35  *          if ((i >> j) & 1) {
36  *              m |= 0xfful << (j << 3);
37  *          }
38  *      }
39  *      printf("0x%016lx,\n", m);
40  *  }
41  */
42 const uint64_t expand_pred_b_data[256] = {
43     0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44     0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45     0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46     0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47     0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48     0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49     0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50     0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51     0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52     0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53     0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54     0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55     0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56     0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57     0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58     0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59     0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60     0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61     0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62     0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63     0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64     0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65     0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66     0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67     0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68     0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69     0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70     0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71     0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72     0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73     0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74     0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75     0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76     0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77     0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78     0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79     0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80     0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81     0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82     0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83     0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84     0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85     0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86     0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87     0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88     0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89     0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90     0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91     0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92     0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93     0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94     0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95     0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96     0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97     0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98     0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99     0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100     0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101     0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102     0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103     0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104     0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105     0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106     0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107     0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108     0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109     0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110     0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111     0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112     0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113     0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114     0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115     0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116     0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117     0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118     0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119     0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120     0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121     0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122     0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123     0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124     0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125     0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126     0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127     0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128     0xffffffffffffffff,
129 };
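
/*
 * Illustration (not part of the helpers above): a predicate byte is
 * expanded to a byte-wise mask with a plain table lookup, roughly as
 * the expand_pred_b() inline in vec_internal.h does:
 *
 *     static inline uint64_t expand_pred_b(uint8_t byte)
 *     {
 *         return expand_pred_b_data[byte];
 *     }
 *
 * e.g. a predicate byte of 0b00000101 yields the mask 0x0000000000ff00ff.
 */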
130 
131 /*
132  * Similarly for half-word elements.
133  *  for (i = 0; i < 256; ++i) {
134  *      unsigned long m = 0;
135  *      if (i & 0xaa) {
136  *          continue;
137  *      }
138  *      for (j = 0; j < 8; j += 2) {
139  *          if ((i >> j) & 1) {
140  *              m |= 0xfffful << (j << 3);
141  *          }
142  *      }
143  *      printf("[0x%x] = 0x%016lx,\n", i, m);
144  *  }
145  */
146 const uint64_t expand_pred_h_data[0x55 + 1] = {
147     [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148     [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149     [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150     [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151     [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152     [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153     [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154     [0x55] = 0xffffffffffffffff,
155 };
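
/*
 * For halfword elements only every other predicate bit is significant,
 * which is why the table above stops at index 0x55; lookups mask the
 * byte first, roughly as the expand_pred_h() inline in vec_internal.h
 * does:
 *
 *     static inline uint64_t expand_pred_h(uint8_t byte)
 *     {
 *         return expand_pred_h_data[byte & 0x55];
 *     }
 */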
156 
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159                      bool neg, bool round)
160 {
161     /*
162      * Simplify the architectural expression:
163      *     ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164      *   = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
165      */
166     int32_t ret = (int32_t)src1 * src2;
167     if (neg) {
168         ret = -ret;
169     }
170     ret += ((int32_t)src3 << 7) + (round << 6);
171     ret >>= 7;
172 
173     if (ret != (int8_t)ret) {
174         ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175     }
176     return ret;
177 }
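
/*
 * Worked example of the simplification above (illustration only):
 * with e1 = e2 = 0x40 (64), a3 = 0 and rounding enabled,
 *   ((0 << 8) + ((64 * 64) << 1) + (1 << 7)) >> 8 = 8320 >> 8 = 32
 *   ((0 << 7) +  (64 * 64)       + (1 << 6)) >> 7 = 4160 >> 7 = 32
 * while e1 = e2 = INT8_MIN gives an intermediate 128, which does not
 * fit in int8_t and therefore saturates to INT8_MAX.
 */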
178 
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180                              void *va, uint32_t desc)
181 {
182     intptr_t i, opr_sz = simd_oprsz(desc);
183     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 
185     for (i = 0; i < opr_sz; ++i) {
186         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187     }
188 }
189 
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191                              void *va, uint32_t desc)
192 {
193     intptr_t i, opr_sz = simd_oprsz(desc);
194     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 
196     for (i = 0; i < opr_sz; ++i) {
197         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198     }
199 }
200 
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203     intptr_t i, opr_sz = simd_oprsz(desc);
204     int8_t *d = vd, *n = vn, *m = vm;
205 
206     for (i = 0; i < opr_sz; ++i) {
207         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208     }
209 }
210 
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213     intptr_t i, opr_sz = simd_oprsz(desc);
214     int8_t *d = vd, *n = vn, *m = vm;
215 
216     for (i = 0; i < opr_sz; ++i) {
217         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218     }
219 }
220 
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223                       bool neg, bool round, uint32_t *sat)
224 {
225     /* Simplify similarly to do_sqrdmlah_b above.  */
226     int32_t ret = (int32_t)src1 * src2;
227     if (neg) {
228         ret = -ret;
229     }
230     ret += ((int32_t)src3 << 15) + (round << 14);
231     ret >>= 15;
232 
233     if (ret != (int16_t)ret) {
234         *sat = 1;
235         ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236     }
237     return ret;
238 }
239 
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241                                   uint32_t src2, uint32_t src3)
242 {
243     uint32_t *sat = &env->vfp.qc[0];
244     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246                                 false, true, sat);
247     return deposit32(e1, 16, 16, e2);
248 }
249 
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251                               void *vq, uint32_t desc)
252 {
253     uintptr_t opr_sz = simd_oprsz(desc);
254     int16_t *d = vd;
255     int16_t *n = vn;
256     int16_t *m = vm;
257     uintptr_t i;
258 
259     for (i = 0; i < opr_sz / 2; ++i) {
260         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261     }
262     clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264 
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266                                   uint32_t src2, uint32_t src3)
267 {
268     uint32_t *sat = &env->vfp.qc[0];
269     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271                                 true, true, sat);
272     return deposit32(e1, 16, 16, e2);
273 }
274 
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276                               void *vq, uint32_t desc)
277 {
278     uintptr_t opr_sz = simd_oprsz(desc);
279     int16_t *d = vd;
280     int16_t *n = vn;
281     int16_t *m = vm;
282     uintptr_t i;
283 
284     for (i = 0; i < opr_sz / 2; ++i) {
285         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286     }
287     clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289 
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291                             void *vq, uint32_t desc)
292 {
293     intptr_t i, opr_sz = simd_oprsz(desc);
294     int16_t *d = vd, *n = vn, *m = vm;
295 
296     for (i = 0; i < opr_sz / 2; ++i) {
297         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298     }
299     clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301 
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303                              void *vq, uint32_t desc)
304 {
305     intptr_t i, opr_sz = simd_oprsz(desc);
306     int16_t *d = vd, *n = vn, *m = vm;
307 
308     for (i = 0; i < opr_sz / 2; ++i) {
309         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310     }
311     clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313 
314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315                                 void *vq, uint32_t desc)
316 {
317     intptr_t i, j, opr_sz = simd_oprsz(desc);
318     int idx = simd_data(desc);
319     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320     intptr_t elements = opr_sz / 2;
321     intptr_t eltspersegment = MIN(16 / 2, elements);
322 
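    /*
     * The scalar multiplicand is element 'idx' of each 128-bit segment
     * of vm; it is applied to every element of that same segment.
     */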
323     for (i = 0; i < elements; i += 16 / 2) {
324         int16_t mm = m[i];
325         for (j = 0; j < eltspersegment; ++j) {
326             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
327         }
328     }
329     clear_tail(d, opr_sz, simd_maxsz(desc));
330 }
331 
332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
333                                  void *vq, uint32_t desc)
334 {
335     intptr_t i, j, opr_sz = simd_oprsz(desc);
336     int idx = simd_data(desc);
337     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338     intptr_t elements = opr_sz / 2;
339     intptr_t eltspersegment = MIN(16 / 2, elements);
340 
341     for (i = 0; i < elements; i += 16 / 2) {
342         int16_t mm = m[i];
343         for (j = 0; j < eltspersegment; ++j) {
344             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
345         }
346     }
347     clear_tail(d, opr_sz, simd_maxsz(desc));
348 }
349 
350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
351                                  void *vq, uint32_t desc)
352 {
353     intptr_t i, j, opr_sz = simd_oprsz(desc);
354     int idx = simd_data(desc);
355     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
356     intptr_t elements = opr_sz / 2;
357     intptr_t eltspersegment = MIN(16 / 2, elements);
358 
359     for (i = 0; i < elements; i += 16 / 2) {
360         int16_t mm = m[i];
361         for (j = 0; j < eltspersegment; ++j) {
362             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
363         }
364     }
365     clear_tail(d, opr_sz, simd_maxsz(desc));
366 }
367 
368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
369                                  void *vq, uint32_t desc)
370 {
371     intptr_t i, j, opr_sz = simd_oprsz(desc);
372     int idx = simd_data(desc);
373     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
374     intptr_t elements = opr_sz / 2;
375     intptr_t eltspersegment = MIN(16 / 2, elements);
376 
377     for (i = 0; i < elements; i += 16 / 2) {
378         int16_t mm = m[i];
379         for (j = 0; j < eltspersegment; ++j) {
380             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
381         }
382     }
383     clear_tail(d, opr_sz, simd_maxsz(desc));
384 }
385 
386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
387                              void *va, uint32_t desc)
388 {
389     intptr_t i, opr_sz = simd_oprsz(desc);
390     int16_t *d = vd, *n = vn, *m = vm, *a = va;
391     uint32_t discard;
392 
393     for (i = 0; i < opr_sz / 2; ++i) {
394         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
395     }
396 }
397 
398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
399                              void *va, uint32_t desc)
400 {
401     intptr_t i, opr_sz = simd_oprsz(desc);
402     int16_t *d = vd, *n = vn, *m = vm, *a = va;
403     uint32_t discard;
404 
405     for (i = 0; i < opr_sz / 2; ++i) {
406         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
407     }
408 }
409 
410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
411 {
412     intptr_t i, opr_sz = simd_oprsz(desc);
413     int16_t *d = vd, *n = vn, *m = vm;
414     uint32_t discard;
415 
416     for (i = 0; i < opr_sz / 2; ++i) {
417         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
418     }
419 }
420 
421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
422 {
423     intptr_t i, opr_sz = simd_oprsz(desc);
424     int16_t *d = vd, *n = vn, *m = vm;
425     uint32_t discard;
426 
427     for (i = 0; i < opr_sz / 2; ++i) {
428         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
429     }
430 }
431 
432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
433 {
434     intptr_t i, j, opr_sz = simd_oprsz(desc);
435     int idx = simd_data(desc);
436     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
437     uint32_t discard;
438 
439     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
440         int16_t mm = m[i];
441         for (j = 0; j < 16 / 2; ++j) {
442             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
443         }
444     }
445 }
446 
447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
448 {
449     intptr_t i, j, opr_sz = simd_oprsz(desc);
450     int idx = simd_data(desc);
451     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
452     uint32_t discard;
453 
454     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
455         int16_t mm = m[i];
456         for (j = 0; j < 16 / 2; ++j) {
457             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
458         }
459     }
460 }
461 
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
464                       bool neg, bool round, uint32_t *sat)
465 {
466     /* Simplify similarly to do_sqrdmlah_b above.  */
467     int64_t ret = (int64_t)src1 * src2;
468     if (neg) {
469         ret = -ret;
470     }
471     ret += ((int64_t)src3 << 31) + (round << 30);
472     ret >>= 31;
473 
474     if (ret != (int32_t)ret) {
475         *sat = 1;
476         ret = (ret < 0 ? INT32_MIN : INT32_MAX);
477     }
478     return ret;
479 }
480 
481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
482                                   int32_t src2, int32_t src3)
483 {
484     uint32_t *sat = &env->vfp.qc[0];
485     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
486 }
487 
488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
489                               void *vq, uint32_t desc)
490 {
491     uintptr_t opr_sz = simd_oprsz(desc);
492     int32_t *d = vd;
493     int32_t *n = vn;
494     int32_t *m = vm;
495     uintptr_t i;
496 
497     for (i = 0; i < opr_sz / 4; ++i) {
498         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
499     }
500     clear_tail(d, opr_sz, simd_maxsz(desc));
501 }
502 
503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
504                                   int32_t src2, int32_t src3)
505 {
506     uint32_t *sat = &env->vfp.qc[0];
507     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
508 }
509 
510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
511                               void *vq, uint32_t desc)
512 {
513     uintptr_t opr_sz = simd_oprsz(desc);
514     int32_t *d = vd;
515     int32_t *n = vn;
516     int32_t *m = vm;
517     uintptr_t i;
518 
519     for (i = 0; i < opr_sz / 4; ++i) {
520         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
521     }
522     clear_tail(d, opr_sz, simd_maxsz(desc));
523 }
524 
525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
526                             void *vq, uint32_t desc)
527 {
528     intptr_t i, opr_sz = simd_oprsz(desc);
529     int32_t *d = vd, *n = vn, *m = vm;
530 
531     for (i = 0; i < opr_sz / 4; ++i) {
532         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
533     }
534     clear_tail(d, opr_sz, simd_maxsz(desc));
535 }
536 
537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
538                              void *vq, uint32_t desc)
539 {
540     intptr_t i, opr_sz = simd_oprsz(desc);
541     int32_t *d = vd, *n = vn, *m = vm;
542 
543     for (i = 0; i < opr_sz / 4; ++i) {
544         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
545     }
546     clear_tail(d, opr_sz, simd_maxsz(desc));
547 }
548 
549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
550                                 void *vq, uint32_t desc)
551 {
552     intptr_t i, j, opr_sz = simd_oprsz(desc);
553     int idx = simd_data(desc);
554     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
555     intptr_t elements = opr_sz / 4;
556     intptr_t eltspersegment = MIN(16 / 4, elements);
557 
558     for (i = 0; i < elements; i += 16 / 4) {
559         int32_t mm = m[i];
560         for (j = 0; j < eltspersegment; ++j) {
561             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
562         }
563     }
564     clear_tail(d, opr_sz, simd_maxsz(desc));
565 }
566 
567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
568                                  void *vq, uint32_t desc)
569 {
570     intptr_t i, j, opr_sz = simd_oprsz(desc);
571     int idx = simd_data(desc);
572     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
573     intptr_t elements = opr_sz / 4;
574     intptr_t eltspersegment = MIN(16 / 4, elements);
575 
576     for (i = 0; i < elements; i += 16 / 4) {
577         int32_t mm = m[i];
578         for (j = 0; j < eltspersegment; ++j) {
579             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
580         }
581     }
582     clear_tail(d, opr_sz, simd_maxsz(desc));
583 }
584 
585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
586                                  void *vq, uint32_t desc)
587 {
588     intptr_t i, j, opr_sz = simd_oprsz(desc);
589     int idx = simd_data(desc);
590     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
591     intptr_t elements = opr_sz / 4;
592     intptr_t eltspersegment = MIN(16 / 4, elements);
593 
594     for (i = 0; i < elements; i += 16 / 4) {
595         int32_t mm = m[i];
596         for (j = 0; j < eltspersegment; ++j) {
597             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
598         }
599     }
600     clear_tail(d, opr_sz, simd_maxsz(desc));
601 }
602 
603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
604                                  void *vq, uint32_t desc)
605 {
606     intptr_t i, j, opr_sz = simd_oprsz(desc);
607     int idx = simd_data(desc);
608     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
609     intptr_t elements = opr_sz / 4;
610     intptr_t eltspersegment = MIN(16 / 4, elements);
611 
612     for (i = 0; i < elements; i += 16 / 4) {
613         int32_t mm = m[i];
614         for (j = 0; j < eltspersegment; ++j) {
615             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
616         }
617     }
618     clear_tail(d, opr_sz, simd_maxsz(desc));
619 }
620 
621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
622                              void *va, uint32_t desc)
623 {
624     intptr_t i, opr_sz = simd_oprsz(desc);
625     int32_t *d = vd, *n = vn, *m = vm, *a = va;
626     uint32_t discard;
627 
628     for (i = 0; i < opr_sz / 4; ++i) {
629         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
630     }
631 }
632 
633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
634                              void *va, uint32_t desc)
635 {
636     intptr_t i, opr_sz = simd_oprsz(desc);
637     int32_t *d = vd, *n = vn, *m = vm, *a = va;
638     uint32_t discard;
639 
640     for (i = 0; i < opr_sz / 4; ++i) {
641         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
642     }
643 }
644 
645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
646 {
647     intptr_t i, opr_sz = simd_oprsz(desc);
648     int32_t *d = vd, *n = vn, *m = vm;
649     uint32_t discard;
650 
651     for (i = 0; i < opr_sz / 4; ++i) {
652         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
653     }
654 }
655 
656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
657 {
658     intptr_t i, opr_sz = simd_oprsz(desc);
659     int32_t *d = vd, *n = vn, *m = vm;
660     uint32_t discard;
661 
662     for (i = 0; i < opr_sz / 4; ++i) {
663         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
664     }
665 }
666 
667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
668 {
669     intptr_t i, j, opr_sz = simd_oprsz(desc);
670     int idx = simd_data(desc);
671     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
672     uint32_t discard;
673 
674     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
675         int32_t mm = m[i];
676         for (j = 0; j < 16 / 4; ++j) {
677             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
678         }
679     }
680 }
681 
682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
683 {
684     intptr_t i, j, opr_sz = simd_oprsz(desc);
685     int idx = simd_data(desc);
686     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
687     uint32_t discard;
688 
689     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
690         int32_t mm = m[i];
691         for (j = 0; j < 16 / 4; ++j) {
692             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
693         }
694     }
695 }
696 
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
698 static int64_t do_sat128_d(Int128 r)
699 {
700     int64_t ls = int128_getlo(r);
701     int64_t hs = int128_gethi(r);
702 
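    /*
     * The 128-bit value fits in int64_t iff the high half is the
     * sign-extension of the low half.
     */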
703     if (unlikely(hs != (ls >> 63))) {
704         return hs < 0 ? INT64_MIN : INT64_MAX;
705     }
706     return ls;
707 }
708 
709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710 {
711     uint64_t l, h;
712     Int128 r, t;
713 
714     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
715     muls64(&l, &h, m, n);
716     r = int128_make128(l, h);
717     if (neg) {
718         r = int128_neg(r);
719     }
720     if (a) {
721         t = int128_exts64(a);
722         t = int128_lshift(t, 63);
723         r = int128_add(r, t);
724     }
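    /*
     * 1 << 62 is half of the final right-shift's divisor (1 << 63),
     * i.e. the 64-bit analogue of (round << 6) in do_sqrdmlah_b.
     */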
725     if (round) {
726         t = int128_exts64(1ll << 62);
727         r = int128_add(r, t);
728     }
729     r = int128_rshift(r, 63);
730 
731     return do_sat128_d(r);
732 }
733 
734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735                              void *va, uint32_t desc)
736 {
737     intptr_t i, opr_sz = simd_oprsz(desc);
738     int64_t *d = vd, *n = vn, *m = vm, *a = va;
739 
740     for (i = 0; i < opr_sz / 8; ++i) {
741         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742     }
743 }
744 
745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746                              void *va, uint32_t desc)
747 {
748     intptr_t i, opr_sz = simd_oprsz(desc);
749     int64_t *d = vd, *n = vn, *m = vm, *a = va;
750 
751     for (i = 0; i < opr_sz / 8; ++i) {
752         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753     }
754 }
755 
756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757 {
758     intptr_t i, opr_sz = simd_oprsz(desc);
759     int64_t *d = vd, *n = vn, *m = vm;
760 
761     for (i = 0; i < opr_sz / 8; ++i) {
762         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763     }
764 }
765 
766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767 {
768     intptr_t i, opr_sz = simd_oprsz(desc);
769     int64_t *d = vd, *n = vn, *m = vm;
770 
771     for (i = 0; i < opr_sz / 8; ++i) {
772         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773     }
774 }
775 
776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777 {
778     intptr_t i, j, opr_sz = simd_oprsz(desc);
779     int idx = simd_data(desc);
780     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781 
782     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783         int64_t mm = m[i];
784         for (j = 0; j < 16 / 8; ++j) {
785             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786         }
787     }
788 }
789 
790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791 {
792     intptr_t i, j, opr_sz = simd_oprsz(desc);
793     int idx = simd_data(desc);
794     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795 
796     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797         int64_t mm = m[i];
798         for (j = 0; j < 16 / 8; ++j) {
799             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800         }
801     }
802 }
803 
804 /* Integer 8 and 16-bit dot-product.
805  *
806  * Note that for the loops herein, host endianness does not matter
807  * with respect to the ordering of data within the quad-width lanes.
808  * All elements are treated equally, no matter where they are.
809  */
810 
811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
813 {                                                                         \
814     intptr_t i, opr_sz = simd_oprsz(desc);                                \
815     TYPED *d = vd, *a = va;                                               \
816     TYPEN *n = vn;                                                        \
817     TYPEM *m = vm;                                                        \
818     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
819         d[i] = (a[i] +                                                    \
820                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
821                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
822                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
823                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
824     }                                                                     \
825     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
826 }
827 
828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
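
/*
 * Illustration: each TYPED-sized lane accumulates four products, so for
 * gvec_sdot_b a lane with n = {1, 2, 3, 4} and m = {1, 1, 1, 1} adds
 * 1 + 2 + 3 + 4 = 10 to the corresponding lane of the accumulator.
 */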
833 
834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
836 {                                                                         \
837     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
838     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
839     /*                                                                    \
840      * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
841      * first iteration might not be a full 16 byte segment. But           \
842      * for vector lengths beyond that this must be SVE and we know        \
843      * opr_sz is a multiple of 16, so we need not clamp segend            \
844      * to opr_sz_n when we advance it at the end of the loop.             \
845      */                                                                   \
846     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
847     intptr_t index = simd_data(desc);                                     \
848     TYPED *d = vd, *a = va;                                               \
849     TYPEN *n = vn;                                                        \
850     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
851     do {                                                                  \
852         TYPED m0 = m_indexed[i * 4 + 0];                                  \
853         TYPED m1 = m_indexed[i * 4 + 1];                                  \
854         TYPED m2 = m_indexed[i * 4 + 2];                                  \
855         TYPED m3 = m_indexed[i * 4 + 3];                                  \
856         do {                                                              \
857             d[i] = (a[i] +                                                \
858                     n[i * 4 + 0] * m0 +                                   \
859                     n[i * 4 + 1] * m1 +                                   \
860                     n[i * 4 + 2] * m2 +                                   \
861                     n[i * 4 + 3] * m3);                                   \
862         } while (++i < segend);                                           \
863         segend = i + (16 / sizeof(TYPED));                                \
864     } while (i < opr_sz_n);                                               \
865     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
866 }
867 
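/*
 * The group of four TYPEM elements at position 'index' within each
 * 128-bit segment of vm is multiplied against every group of four
 * TYPEN elements in that same segment.
 */
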
868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
874 
875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
876                          void *vfpst, uint32_t desc)
877 {
878     uintptr_t opr_sz = simd_oprsz(desc);
879     float16 *d = vd;
880     float16 *n = vn;
881     float16 *m = vm;
882     float_status *fpst = vfpst;
883     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
884     uint32_t neg_imag = neg_real ^ 1;
885     uintptr_t i;
886 
887     /* Shift boolean to the sign bit so we can xor to negate.  */
888     neg_real <<= 15;
889     neg_imag <<= 15;
890 
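    /*
     * Each pair (i, i + 1) holds one complex element.  With neg_real
     * clear this computes n.r - m.i and n.i + m.r, i.e. FCADD with a
     * rotation of 90 degrees; with neg_real set it computes n.r + m.i
     * and n.i - m.r, a rotation of 270 degrees.
     */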
891     for (i = 0; i < opr_sz / 2; i += 2) {
892         float16 e0 = n[H2(i)];
893         float16 e1 = m[H2(i + 1)] ^ neg_imag;
894         float16 e2 = n[H2(i + 1)];
895         float16 e3 = m[H2(i)] ^ neg_real;
896 
897         d[H2(i)] = float16_add(e0, e1, fpst);
898         d[H2(i + 1)] = float16_add(e2, e3, fpst);
899     }
900     clear_tail(d, opr_sz, simd_maxsz(desc));
901 }
902 
903 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
904                          void *vfpst, uint32_t desc)
905 {
906     uintptr_t opr_sz = simd_oprsz(desc);
907     float32 *d = vd;
908     float32 *n = vn;
909     float32 *m = vm;
910     float_status *fpst = vfpst;
911     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
912     uint32_t neg_imag = neg_real ^ 1;
913     uintptr_t i;
914 
915     /* Shift boolean to the sign bit so we can xor to negate.  */
916     neg_real <<= 31;
917     neg_imag <<= 31;
918 
919     for (i = 0; i < opr_sz / 4; i += 2) {
920         float32 e0 = n[H4(i)];
921         float32 e1 = m[H4(i + 1)] ^ neg_imag;
922         float32 e2 = n[H4(i + 1)];
923         float32 e3 = m[H4(i)] ^ neg_real;
924 
925         d[H4(i)] = float32_add(e0, e1, fpst);
926         d[H4(i + 1)] = float32_add(e2, e3, fpst);
927     }
928     clear_tail(d, opr_sz, simd_maxsz(desc));
929 }
930 
931 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
932                          void *vfpst, uint32_t desc)
933 {
934     uintptr_t opr_sz = simd_oprsz(desc);
935     float64 *d = vd;
936     float64 *n = vn;
937     float64 *m = vm;
938     float_status *fpst = vfpst;
939     uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
940     uint64_t neg_imag = neg_real ^ 1;
941     uintptr_t i;
942 
943     /* Shift boolean to the sign bit so we can xor to negate.  */
944     neg_real <<= 63;
945     neg_imag <<= 63;
946 
947     for (i = 0; i < opr_sz / 8; i += 2) {
948         float64 e0 = n[i];
949         float64 e1 = m[i + 1] ^ neg_imag;
950         float64 e2 = n[i + 1];
951         float64 e3 = m[i] ^ neg_real;
952 
953         d[i] = float64_add(e0, e1, fpst);
954         d[i + 1] = float64_add(e2, e3, fpst);
955     }
956     clear_tail(d, opr_sz, simd_maxsz(desc));
957 }
958 
959 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
960                          void *vfpst, uint32_t desc)
961 {
962     uintptr_t opr_sz = simd_oprsz(desc);
963     float16 *d = vd, *n = vn, *m = vm, *a = va;
964     float_status *fpst = vfpst;
965     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
966     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
967     uint32_t neg_real = flip ^ neg_imag;
968     uintptr_t i;
969 
970     /* Shift boolean to the sign bit so we can xor to negate.  */
971     neg_real <<= 15;
972     neg_imag <<= 15;
973 
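    /*
     * 'flip' selects whether the real or the imaginary part of n feeds
     * both products (and swaps the m parts to match); together with
     * neg_real/neg_imag this implements the four FCMLA rotations
     * (0, 90, 180 and 270 degrees).
     */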
974     for (i = 0; i < opr_sz / 2; i += 2) {
975         float16 e2 = n[H2(i + flip)];
976         float16 e1 = m[H2(i + flip)] ^ neg_real;
977         float16 e4 = e2;
978         float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
979 
980         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
981         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
982     }
983     clear_tail(d, opr_sz, simd_maxsz(desc));
984 }
985 
986 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
987                              void *vfpst, uint32_t desc)
988 {
989     uintptr_t opr_sz = simd_oprsz(desc);
990     float16 *d = vd, *n = vn, *m = vm, *a = va;
991     float_status *fpst = vfpst;
992     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
993     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
994     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
995     uint32_t neg_real = flip ^ neg_imag;
996     intptr_t elements = opr_sz / sizeof(float16);
997     intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
998     intptr_t i, j;
999 
1000     /* Shift boolean to the sign bit so we can xor to negate.  */
1001     neg_real <<= 15;
1002     neg_imag <<= 15;
1003 
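    /*
     * Complex element 'index' of each 128-bit segment of m is broadcast
     * against every complex element of n within that same segment.
     */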
1004     for (i = 0; i < elements; i += eltspersegment) {
1005         float16 mr = m[H2(i + 2 * index + 0)];
1006         float16 mi = m[H2(i + 2 * index + 1)];
1007         float16 e1 = neg_real ^ (flip ? mi : mr);
1008         float16 e3 = neg_imag ^ (flip ? mr : mi);
1009 
1010         for (j = i; j < i + eltspersegment; j += 2) {
1011             float16 e2 = n[H2(j + flip)];
1012             float16 e4 = e2;
1013 
1014             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
1015             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
1016         }
1017     }
1018     clear_tail(d, opr_sz, simd_maxsz(desc));
1019 }
1020 
1021 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
1022                          void *vfpst, uint32_t desc)
1023 {
1024     uintptr_t opr_sz = simd_oprsz(desc);
1025     float32 *d = vd, *n = vn, *m = vm, *a = va;
1026     float_status *fpst = vfpst;
1027     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1028     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1029     uint32_t neg_real = flip ^ neg_imag;
1030     uintptr_t i;
1031 
1032     /* Shift boolean to the sign bit so we can xor to negate.  */
1033     neg_real <<= 31;
1034     neg_imag <<= 31;
1035 
1036     for (i = 0; i < opr_sz / 4; i += 2) {
1037         float32 e2 = n[H4(i + flip)];
1038         float32 e1 = m[H4(i + flip)] ^ neg_real;
1039         float32 e4 = e2;
1040         float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
1041 
1042         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
1043         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
1044     }
1045     clear_tail(d, opr_sz, simd_maxsz(desc));
1046 }
1047 
1048 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
1049                              void *vfpst, uint32_t desc)
1050 {
1051     uintptr_t opr_sz = simd_oprsz(desc);
1052     float32 *d = vd, *n = vn, *m = vm, *a = va;
1053     float_status *fpst = vfpst;
1054     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1055     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1056     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1057     uint32_t neg_real = flip ^ neg_imag;
1058     intptr_t elements = opr_sz / sizeof(float32);
1059     intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
1060     intptr_t i, j;
1061 
1062     /* Shift boolean to the sign bit so we can xor to negate.  */
1063     neg_real <<= 31;
1064     neg_imag <<= 31;
1065 
1066     for (i = 0; i < elements; i += eltspersegment) {
1067         float32 mr = m[H4(i + 2 * index + 0)];
1068         float32 mi = m[H4(i + 2 * index + 1)];
1069         float32 e1 = neg_real ^ (flip ? mi : mr);
1070         float32 e3 = neg_imag ^ (flip ? mr : mi);
1071 
1072         for (j = i; j < i + eltspersegment; j += 2) {
1073             float32 e2 = n[H4(j + flip)];
1074             float32 e4 = e2;
1075 
1076             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
1077             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
1078         }
1079     }
1080     clear_tail(d, opr_sz, simd_maxsz(desc));
1081 }
1082 
1083 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
1084                          void *vfpst, uint32_t desc)
1085 {
1086     uintptr_t opr_sz = simd_oprsz(desc);
1087     float64 *d = vd, *n = vn, *m = vm, *a = va;
1088     float_status *fpst = vfpst;
1089     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1090     uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1091     uint64_t neg_real = flip ^ neg_imag;
1092     uintptr_t i;
1093 
1094     /* Shift boolean to the sign bit so we can xor to negate.  */
1095     neg_real <<= 63;
1096     neg_imag <<= 63;
1097 
1098     for (i = 0; i < opr_sz / 8; i += 2) {
1099         float64 e2 = n[i + flip];
1100         float64 e1 = m[i + flip] ^ neg_real;
1101         float64 e4 = e2;
1102         float64 e3 = m[i + 1 - flip] ^ neg_imag;
1103 
1104         d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
1105         d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
1106     }
1107     clear_tail(d, opr_sz, simd_maxsz(desc));
1108 }
1109 
1110 /*
1111  * Floating point comparisons producing an integer result (all 1s or all 0s).
1112  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1113  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1114  */
1115 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1116 {
1117     return -float16_eq_quiet(op1, op2, stat);
1118 }
1119 
1120 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1121 {
1122     return -float32_eq_quiet(op1, op2, stat);
1123 }
1124 
1125 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1126 {
1127     return -float64_eq_quiet(op1, op2, stat);
1128 }
1129 
1130 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1131 {
1132     return -float16_le(op2, op1, stat);
1133 }
1134 
1135 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1136 {
1137     return -float32_le(op2, op1, stat);
1138 }
1139 
1140 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1141 {
1142     return -float64_le(op2, op1, stat);
1143 }
1144 
1145 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1146 {
1147     return -float16_lt(op2, op1, stat);
1148 }
1149 
1150 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1151 {
1152     return -float32_lt(op2, op1, stat);
1153 }
1154 
1155 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1156 {
1157     return -float64_lt(op2, op1, stat);
1158 }
1159 
1160 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1161 {
1162     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1163 }
1164 
1165 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1166 {
1167     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1168 }
1169 
1170 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1171 {
1172     return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1173 }
1174 
1175 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1176 {
1177     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1178 }
1179 
1180 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1181 {
1182     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1183 }
1184 
1185 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1186 {
1187     return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1188 }
1189 
1190 static int16_t vfp_tosszh(float16 x, void *fpstp)
1191 {
1192     float_status *fpst = fpstp;
1193     if (float16_is_any_nan(x)) {
1194         float_raise(float_flag_invalid, fpst);
1195         return 0;
1196     }
1197     return float16_to_int16_round_to_zero(x, fpst);
1198 }
1199 
1200 static uint16_t vfp_touszh(float16 x, void *fpstp)
1201 {
1202     float_status *fpst = fpstp;
1203     if (float16_is_any_nan(x)) {
1204         float_raise(float_flag_invalid, fpst);
1205         return 0;
1206     }
1207     return float16_to_uint16_round_to_zero(x, fpst);
1208 }
1209 
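/* Expand a scalar operation elementwise: d[i] = FUNC(n[i], stat). */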
1210 #define DO_2OP(NAME, FUNC, TYPE) \
1211 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
1212 {                                                                 \
1213     intptr_t i, oprsz = simd_oprsz(desc);                         \
1214     TYPE *d = vd, *n = vn;                                        \
1215     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1216         d[i] = FUNC(n[i], stat);                                  \
1217     }                                                             \
1218     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1219 }
1220 
1221 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1222 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1223 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1224 
1225 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1226 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1227 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1228 
1229 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1230 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1231 
1232 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1233 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1234 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1235 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1236 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1237 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1238 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1239 DO_2OP(gvec_touszh, vfp_touszh, float16)
1240 
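/*
 * Compare-with-zero wrappers: FWD compares the operand against zero and
 * REV compares zero against the operand, so clt/cle below can reuse the
 * cgt/cge comparisons with their arguments swapped.
 */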
1241 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1242     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1243     {                                                           \
1244         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1245     }
1246 
1247 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1248     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1249     {                                                           \
1250         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1251     }
1252 
1253 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1254     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1255     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1256     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1257     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1258 
1259 DO_2OP_CMP0(cgt, cgt, FWD)
1260 DO_2OP_CMP0(cge, cge, FWD)
1261 DO_2OP_CMP0(ceq, ceq, FWD)
1262 DO_2OP_CMP0(clt, cgt, REV)
1263 DO_2OP_CMP0(cle, cge, REV)
1264 
1265 #undef DO_2OP
1266 #undef DO_2OP_CMP0
1267 
1268 /* Floating-point trigonometric starting value.
1269  * See the ARM ARM pseudocode function FPTrigSMul.
1270  */
1271 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1272 {
1273     float16 result = float16_mul(op1, op1, stat);
1274     if (!float16_is_any_nan(result)) {
1275         result = float16_set_sign(result, op2 & 1);
1276     }
1277     return result;
1278 }
1279 
1280 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1281 {
1282     float32 result = float32_mul(op1, op1, stat);
1283     if (!float32_is_any_nan(result)) {
1284         result = float32_set_sign(result, op2 & 1);
1285     }
1286     return result;
1287 }
1288 
1289 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1290 {
1291     float64 result = float64_mul(op1, op1, stat);
1292     if (!float64_is_any_nan(result)) {
1293         result = float64_set_sign(result, op2 & 1);
1294     }
1295     return result;
1296 }
1297 
1298 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1299 {
1300     return float16_abs(float16_sub(op1, op2, stat));
1301 }
1302 
1303 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1304 {
1305     return float32_abs(float32_sub(op1, op2, stat));
1306 }
1307 
1308 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1309 {
1310     return float64_abs(float64_sub(op1, op2, stat));
1311 }
1312 
1313 /*
1314  * Reciprocal step. These are the AArch32 versions, which use a
1315  * non-fused multiply-and-subtract.
1316  */
1317 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1318 {
1319     op1 = float16_squash_input_denormal(op1, stat);
1320     op2 = float16_squash_input_denormal(op2, stat);
1321 
1322     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1323         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1324         return float16_two;
1325     }
1326     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1327 }
1328 
1329 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1330 {
1331     op1 = float32_squash_input_denormal(op1, stat);
1332     op2 = float32_squash_input_denormal(op2, stat);
1333 
1334     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1335         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1336         return float32_two;
1337     }
1338     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1339 }
1340 
1341 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1342 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1343 {
1344     op1 = float16_squash_input_denormal(op1, stat);
1345     op2 = float16_squash_input_denormal(op2, stat);
1346 
1347     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1348         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1349         return float16_one_point_five;
1350     }
1351     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1352     return float16_div(op1, float16_two, stat);
1353 }
1354 
1355 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1356 {
1357     op1 = float32_squash_input_denormal(op1, stat);
1358     op2 = float32_squash_input_denormal(op2, stat);
1359 
1360     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1361         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1362         return float32_one_point_five;
1363     }
1364     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1365     return float32_div(op1, float32_two, stat);
1366 }
1367 
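/* Expand a binary scalar op elementwise: d[i] = FUNC(n[i], m[i], stat). */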
1368 #define DO_3OP(NAME, FUNC, TYPE) \
1369 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1370 {                                                                          \
1371     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1372     TYPE *d = vd, *n = vn, *m = vm;                                        \
1373     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1374         d[i] = FUNC(n[i], m[i], stat);                                     \
1375     }                                                                      \
1376     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1377 }
1378 
1379 DO_3OP(gvec_fadd_h, float16_add, float16)
1380 DO_3OP(gvec_fadd_s, float32_add, float32)
1381 DO_3OP(gvec_fadd_d, float64_add, float64)
1382 
1383 DO_3OP(gvec_fsub_h, float16_sub, float16)
1384 DO_3OP(gvec_fsub_s, float32_sub, float32)
1385 DO_3OP(gvec_fsub_d, float64_sub, float64)
1386 
1387 DO_3OP(gvec_fmul_h, float16_mul, float16)
1388 DO_3OP(gvec_fmul_s, float32_mul, float32)
1389 DO_3OP(gvec_fmul_d, float64_mul, float64)
1390 
1391 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1392 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1393 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1394 
1395 DO_3OP(gvec_fabd_h, float16_abd, float16)
1396 DO_3OP(gvec_fabd_s, float32_abd, float32)
1397 DO_3OP(gvec_fabd_d, float64_abd, float64)
1398 
1399 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1400 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1401 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1402 
1403 DO_3OP(gvec_fcge_h, float16_cge, float16)
1404 DO_3OP(gvec_fcge_s, float32_cge, float32)
1405 DO_3OP(gvec_fcge_d, float64_cge, float64)
1406 
1407 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1408 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1409 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1410 
1411 DO_3OP(gvec_facge_h, float16_acge, float16)
1412 DO_3OP(gvec_facge_s, float32_acge, float32)
1413 DO_3OP(gvec_facge_d, float64_acge, float64)
1414 
1415 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1416 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1417 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1418 
1419 DO_3OP(gvec_fmax_h, float16_max, float16)
1420 DO_3OP(gvec_fmax_s, float32_max, float32)
1421 DO_3OP(gvec_fmax_d, float64_max, float64)
1422 
1423 DO_3OP(gvec_fmin_h, float16_min, float16)
1424 DO_3OP(gvec_fmin_s, float32_min, float32)
1425 DO_3OP(gvec_fmin_d, float64_min, float64)
1426 
1427 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1428 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1429 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1430 
1431 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1432 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1433 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1434 
1435 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1436 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1437 
1438 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1439 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1440 
1441 #ifdef TARGET_AARCH64
1442 DO_3OP(gvec_fdiv_h, float16_div, float16)
1443 DO_3OP(gvec_fdiv_s, float32_div, float32)
1444 DO_3OP(gvec_fdiv_d, float64_div, float64)
1445 
1446 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1447 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1448 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1449 
1450 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1451 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1452 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1453 
1454 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1455 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1456 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1457 
1458 #endif
1459 #undef DO_3OP
1460 
1461 /* Non-fused multiply-add (unlike float16_muladd etc., which are fused) */
1462 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1463                                  float_status *stat)
1464 {
1465     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1466 }
1467 
1468 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1469                                  float_status *stat)
1470 {
1471     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1472 }
1473 
1474 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1475                                  float_status *stat)
1476 {
1477     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1478 }
1479 
1480 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1481                                  float_status *stat)
1482 {
1483     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1484 }
1485 
1486 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1487 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1488                                 float_status *stat)
1489 {
1490     return float16_muladd(op1, op2, dest, 0, stat);
1491 }
1492 
1493 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1494                                  float_status *stat)
1495 {
1496     return float32_muladd(op1, op2, dest, 0, stat);
1497 }
1498 
1499 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1500                                  float_status *stat)
1501 {
1502     return float64_muladd(op1, op2, dest, 0, stat);
1503 }
1504 
1505 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1506                                  float_status *stat)
1507 {
1508     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1509 }
1510 
1511 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1512                                  float_status *stat)
1513 {
1514     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1515 }
1516 
1517 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1518                                  float_status *stat)
1519 {
1520     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1521 }
1522 
1523 #define DO_MULADD(NAME, FUNC, TYPE)                                     \
1524 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1525 {                                                                          \
1526     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1527     TYPE *d = vd, *n = vn, *m = vm;                                        \
1528     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1529         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1530     }                                                                      \
1531     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1532 }
1533 
1534 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1535 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1536 
1537 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1538 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1539 
1540 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1541 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1542 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1543 
1544 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1545 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1546 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1547 
1548 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1549  * For AdvSIMD, there is of course only one such vector segment.
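 * E.g. for gvec_mul_idx_s with oprsz == 32 and index 1, elements 0..3
 * of Vn are multiplied by m[1] and elements 4..7 by m[5].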
1550  */
1551 
1552 #define DO_MUL_IDX(NAME, TYPE, H) \
1553 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1554 {                                                                          \
1555     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1556     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1557     intptr_t idx = simd_data(desc);                                        \
1558     TYPE *d = vd, *n = vn, *m = vm;                                        \
1559     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1560         TYPE mm = m[H(i + idx)];                                           \
1561         for (j = 0; j < segment; j++) {                                    \
1562             d[i + j] = n[i + j] * mm;                                      \
1563         }                                                                  \
1564     }                                                                      \
1565     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1566 }
1567 
1568 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1569 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1570 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1571 
1572 #undef DO_MUL_IDX
1573 
1574 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1575 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1576 {                                                                          \
1577     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1578     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1579     intptr_t idx = simd_data(desc);                                        \
1580     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1581     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1582         TYPE mm = m[H(i + idx)];                                           \
1583         for (j = 0; j < segment; j++) {                                    \
1584             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1585         }                                                                  \
1586     }                                                                      \
1587     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1588 }
1589 
1590 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1591 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1592 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1593 
1594 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1595 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1596 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1597 
1598 #undef DO_MLA_IDX
1599 
1600 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1601 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1602 {                                                                          \
1603     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1604     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1605     intptr_t idx = simd_data(desc);                                        \
1606     TYPE *d = vd, *n = vn, *m = vm;                                        \
1607     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1608         TYPE mm = m[H(i + idx)];                                           \
1609         for (j = 0; j < segment; j++) {                                    \
1610             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1611         }                                                                  \
1612     }                                                                      \
1613     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1614 }
1615 
1616 #define nop(N, M, S) (M)
1617 
1618 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1619 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1620 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1621 
1622 #ifdef TARGET_AARCH64
1623 
1624 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1625 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1626 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1627 
1628 #endif
1629 
1630 #undef nop
1631 
1632 /*
1633  * Non-fused multiply-accumulate operations, for Neon. NB that, unlike
1634  * the fused ops below, these accumulate both from and into Vd.
1635  */
1636 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1637 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1638 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1639 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1640 
1641 #undef DO_FMUL_IDX
1642 
1643 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1644 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1645                   void *stat, uint32_t desc)                               \
1646 {                                                                          \
1647     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1648     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1649     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1650     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1651     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1652     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1653     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1654         TYPE mm = m[H(i + idx)];                                           \
1655         for (j = 0; j < segment; j++) {                                    \
1656             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1657                                      mm, a[i + j], 0, stat);               \
1658         }                                                                  \
1659     }                                                                      \
1660     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1661 }
1662 
1663 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1664 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1665 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1666 
1667 #undef DO_FMLA_IDX
1668 
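/*
 * Saturating add/sub: compute in the wider WTYPE, clamp the result to
 * [MIN, MAX], and set the sticky QC flag (via *vq) when saturation occurs.
 */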
1669 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1670 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1671 {                                                                          \
1672     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1673     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1674     bool q = false;                                                        \
1675     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1676         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1677         if (dd < MIN) {                                                    \
1678             dd = MIN;                                                      \
1679             q = true;                                                      \
1680         } else if (dd > MAX) {                                             \
1681             dd = MAX;                                                      \
1682             q = true;                                                      \
1683         }                                                                  \
1684         d[i] = dd;                                                         \
1685     }                                                                      \
1686     if (q) {                                                               \
1687         uint32_t *qc = vq;                                                 \
1688         qc[0] = 1;                                                         \
1689     }                                                                      \
1690     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1691 }
1692 
1693 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1694 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1695 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1696 
1697 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1698 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1699 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1700 
1701 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1702 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1703 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1704 
1705 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1706 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1707 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1708 
1709 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1710 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1711 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1712 
1713 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1714 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1715 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1716 
1717 #undef DO_SAT
1718 
1719 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1720                           void *vm, uint32_t desc)
1721 {
1722     intptr_t i, oprsz = simd_oprsz(desc);
1723     uint64_t *d = vd, *n = vn, *m = vm;
1724     bool q = false;
1725 
1726     for (i = 0; i < oprsz / 8; i++) {
1727         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1728         if (dd < nn) {
1729             dd = UINT64_MAX;
1730             q = true;
1731         }
1732         d[i] = dd;
1733     }
1734     if (q) {
1735         uint32_t *qc = vq;
1736         qc[0] = 1;
1737     }
1738     clear_tail(d, oprsz, simd_maxsz(desc));
1739 }
1740 
1741 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1742                           void *vm, uint32_t desc)
1743 {
1744     intptr_t i, oprsz = simd_oprsz(desc);
1745     uint64_t *d = vd, *n = vn, *m = vm;
1746     bool q = false;
1747 
1748     for (i = 0; i < oprsz / 8; i++) {
1749         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1750         if (nn < mm) {
1751             dd = 0;
1752             q = true;
1753         }
1754         d[i] = dd;
1755     }
1756     if (q) {
1757         uint32_t *qc = vq;
1758         qc[0] = 1;
1759     }
1760     clear_tail(d, oprsz, simd_maxsz(desc));
1761 }
1762 
1763 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1764                           void *vm, uint32_t desc)
1765 {
1766     intptr_t i, oprsz = simd_oprsz(desc);
1767     int64_t *d = vd, *n = vn, *m = vm;
1768     bool q = false;
1769 
1770     for (i = 0; i < oprsz / 8; i++) {
1771         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1772         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1773             dd = (nn >> 63) ^ ~INT64_MIN;
1774             q = true;
1775         }
1776         d[i] = dd;
1777     }
1778     if (q) {
1779         uint32_t *qc = vq;
1780         qc[0] = 1;
1781     }
1782     clear_tail(d, oprsz, simd_maxsz(desc));
1783 }
1784 
1785 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1786                           void *vm, uint32_t desc)
1787 {
1788     intptr_t i, oprsz = simd_oprsz(desc);
1789     int64_t *d = vd, *n = vn, *m = vm;
1790     bool q = false;
1791 
1792     for (i = 0; i < oprsz / 8; i++) {
1793         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1794         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1795             dd = (nn >> 63) ^ ~INT64_MIN;
1796             q = true;
1797         }
1798         d[i] = dd;
1799     }
1800     if (q) {
1801         uint32_t *qc = vq;
1802         qc[0] = 1;
1803     }
1804     clear_tail(d, oprsz, simd_maxsz(desc));
1805 }
1806 
1807 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1808                            void *vm, uint32_t desc)
1809 {
1810     intptr_t i, oprsz = simd_oprsz(desc);
1811     uint64_t *d = vd, *n = vn, *m = vm;
1812     bool q = false;
1813 
1814     for (i = 0; i < oprsz / 8; i++) {
1815         uint64_t nn = n[i];
1816         int64_t mm = m[i];
1817         uint64_t dd = nn + mm;
1818 
1819         if (mm < 0) {
1820             if (nn < (uint64_t)-mm) {
1821                 dd = 0;
1822                 q = true;
1823             }
1824         } else {
1825             if (dd < nn) {
1826                 dd = UINT64_MAX;
1827                 q = true;
1828             }
1829         }
1830         d[i] = dd;
1831     }
1832     if (q) {
1833         uint32_t *qc = vq;
1834         qc[0] = 1;
1835     }
1836     clear_tail(d, oprsz, simd_maxsz(desc));
1837 }
1838 
1839 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1840                            void *vm, uint32_t desc)
1841 {
1842     intptr_t i, oprsz = simd_oprsz(desc);
1843     uint64_t *d = vd, *n = vn, *m = vm;
1844     bool q = false;
1845 
1846     for (i = 0; i < oprsz / 8; i++) {
1847         int64_t nn = n[i];
1848         uint64_t mm = m[i];
1849         int64_t dd = nn + mm;
1850 
1851         if (mm > (uint64_t)(INT64_MAX - nn)) {
1852             dd = INT64_MAX;
1853             q = true;
1854         }
1855         d[i] = dd;
1856     }
1857     if (q) {
1858         uint32_t *qc = vq;
1859         qc[0] = 1;
1860     }
1861     clear_tail(d, oprsz, simd_maxsz(desc));
1862 }
1863 
1864 #define DO_SRA(NAME, TYPE)                              \
1865 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1866 {                                                       \
1867     intptr_t i, oprsz = simd_oprsz(desc);               \
1868     int shift = simd_data(desc);                        \
1869     TYPE *d = vd, *n = vn;                              \
1870     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1871         d[i] += n[i] >> shift;                          \
1872     }                                                   \
1873     clear_tail(d, oprsz, simd_maxsz(desc));             \
1874 }
1875 
1876 DO_SRA(gvec_ssra_b, int8_t)
1877 DO_SRA(gvec_ssra_h, int16_t)
1878 DO_SRA(gvec_ssra_s, int32_t)
1879 DO_SRA(gvec_ssra_d, int64_t)
1880 
1881 DO_SRA(gvec_usra_b, uint8_t)
1882 DO_SRA(gvec_usra_h, uint16_t)
1883 DO_SRA(gvec_usra_s, uint32_t)
1884 DO_SRA(gvec_usra_d, uint64_t)
1885 
1886 #undef DO_SRA
1887 
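/*
 * Rounding shift right: shifting by (shift - 1) first keeps the last
 * discarded bit in bit 0, and adding it back after the final shift
 * rounds to nearest.  This matches (n + (1 << (shift - 1))) >> shift
 * without overflowing the intermediate sum; e.g. for uint8_t n == 0xff
 * and shift == 1 the result is 0x80.
 */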
1888 #define DO_RSHR(NAME, TYPE)                             \
1889 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1890 {                                                       \
1891     intptr_t i, oprsz = simd_oprsz(desc);               \
1892     int shift = simd_data(desc);                        \
1893     TYPE *d = vd, *n = vn;                              \
1894     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1895         TYPE tmp = n[i] >> (shift - 1);                 \
1896         d[i] = (tmp >> 1) + (tmp & 1);                  \
1897     }                                                   \
1898     clear_tail(d, oprsz, simd_maxsz(desc));             \
1899 }
1900 
1901 DO_RSHR(gvec_srshr_b, int8_t)
1902 DO_RSHR(gvec_srshr_h, int16_t)
1903 DO_RSHR(gvec_srshr_s, int32_t)
1904 DO_RSHR(gvec_srshr_d, int64_t)
1905 
1906 DO_RSHR(gvec_urshr_b, uint8_t)
1907 DO_RSHR(gvec_urshr_h, uint16_t)
1908 DO_RSHR(gvec_urshr_s, uint32_t)
1909 DO_RSHR(gvec_urshr_d, uint64_t)
1910 
1911 #undef DO_RSHR
1912 
1913 #define DO_RSRA(NAME, TYPE)                             \
1914 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1915 {                                                       \
1916     intptr_t i, oprsz = simd_oprsz(desc);               \
1917     int shift = simd_data(desc);                        \
1918     TYPE *d = vd, *n = vn;                              \
1919     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1920         TYPE tmp = n[i] >> (shift - 1);                 \
1921         d[i] += (tmp >> 1) + (tmp & 1);                 \
1922     }                                                   \
1923     clear_tail(d, oprsz, simd_maxsz(desc));             \
1924 }
1925 
1926 DO_RSRA(gvec_srsra_b, int8_t)
1927 DO_RSRA(gvec_srsra_h, int16_t)
1928 DO_RSRA(gvec_srsra_s, int32_t)
1929 DO_RSRA(gvec_srsra_d, int64_t)
1930 
1931 DO_RSRA(gvec_ursra_b, uint8_t)
1932 DO_RSRA(gvec_ursra_h, uint16_t)
1933 DO_RSRA(gvec_ursra_s, uint32_t)
1934 DO_RSRA(gvec_ursra_d, uint64_t)
1935 
1936 #undef DO_RSRA
1937 
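/*
 * SRI/SLI: shift the source element and insert it into the destination,
 * preserving the destination bits that the shifted value does not cover.
 */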
1938 #define DO_SRI(NAME, TYPE)                              \
1939 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1940 {                                                       \
1941     intptr_t i, oprsz = simd_oprsz(desc);               \
1942     int shift = simd_data(desc);                        \
1943     TYPE *d = vd, *n = vn;                              \
1944     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1945         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1946     }                                                   \
1947     clear_tail(d, oprsz, simd_maxsz(desc));             \
1948 }
1949 
1950 DO_SRI(gvec_sri_b, uint8_t)
1951 DO_SRI(gvec_sri_h, uint16_t)
1952 DO_SRI(gvec_sri_s, uint32_t)
1953 DO_SRI(gvec_sri_d, uint64_t)
1954 
1955 #undef DO_SRI
1956 
1957 #define DO_SLI(NAME, TYPE)                              \
1958 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1959 {                                                       \
1960     intptr_t i, oprsz = simd_oprsz(desc);               \
1961     int shift = simd_data(desc);                        \
1962     TYPE *d = vd, *n = vn;                              \
1963     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1964         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1965     }                                                   \
1966     clear_tail(d, oprsz, simd_maxsz(desc));             \
1967 }
1968 
1969 DO_SLI(gvec_sli_b, uint8_t)
1970 DO_SLI(gvec_sli_h, uint16_t)
1971 DO_SLI(gvec_sli_s, uint32_t)
1972 DO_SLI(gvec_sli_d, uint64_t)
1973 
1974 #undef DO_SLI
1975 
1976 /*
1977  * Convert float16 to float32, raising no exceptions and
1978  * preserving exceptional values, including SNaN.
1979  * This is effectively an unpack+repack operation.
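 * E.g. the float16 SNaN 0x7d00 becomes the float32 SNaN 0x7fa00000:
 * the exponent is widened to all-ones and the fraction shifted up,
 * leaving the quiet bit clear.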
1980  */
1981 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1982 {
1983     const int f16_bias = 15;
1984     const int f32_bias = 127;
1985     uint32_t sign = extract32(f16, 15, 1);
1986     uint32_t exp = extract32(f16, 10, 5);
1987     uint32_t frac = extract32(f16, 0, 10);
1988 
1989     if (exp == 0x1f) {
1990         /* Inf or NaN */
1991         exp = 0xff;
1992     } else if (exp == 0) {
1993         /* Zero or denormal.  */
1994         if (frac != 0) {
1995             if (fz16) {
1996                 frac = 0;
1997             } else {
1998                 /*
1999                  * Denormal; these are all normal float32.
2000                  * Shift the fraction so that its msb lands in the
2001                  * implicit bit position of the normalized float32,
2002                  * then mask that bit away.  Note that we still go through
2003                  * the shift for normal numbers below, to put the
2004                  * float32 fraction at the right place.
2005                  */
2006                 int shift = clz32(frac) - 21;
2007                 frac = (frac << shift) & 0x3ff;
2008                 exp = f32_bias - f16_bias - shift + 1;
2009             }
2010         }
2011     } else {
2012         /* Normal number; adjust the bias.  */
2013         exp += f32_bias - f16_bias;
2014     }
2015     sign <<= 31;
2016     exp <<= 23;
2017     frac <<= 23 - 10;
2018 
2019     return sign | exp | frac;
2020 }
2021 
2022 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2023 {
2024     /*
2025      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2026      * Load the 2nd qword iff is_q & is_2.
2027      * Shift to the 2nd dword iff !is_q & is_2.
2028      * For !is_q & !is_2, the upper bits of the result are garbage.
2029      */
2030     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2031 }
2032 
2033 /*
2034  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2035  * as there are not yet SVE versions that might use blocking.
2036  */
2037 
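/*
 * FMLAL/FMLSL widen the float16 elements selected by is_2 (the low or
 * high half of Vn/Vm) to float32 and multiply-accumulate into the
 * float32 elements of Vd; the subtracting form negates the float16
 * products up front by flipping their sign bits.
 */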
2038 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2039                      uint32_t desc, bool fz16)
2040 {
2041     intptr_t i, oprsz = simd_oprsz(desc);
2042     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2043     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2044     int is_q = oprsz == 16;
2045     uint64_t n_4, m_4;
2046 
2047     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2048     n_4 = load4_f16(vn, is_q, is_2);
2049     m_4 = load4_f16(vm, is_q, is_2);
2050 
2051     /* Negate all inputs for FMLSL at once.  */
2052     if (is_s) {
2053         n_4 ^= 0x8000800080008000ull;
2054     }
2055 
2056     for (i = 0; i < oprsz / 4; i++) {
2057         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2058         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2059         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2060     }
2061     clear_tail(d, oprsz, simd_maxsz(desc));
2062 }
2063 
2064 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2065                             void *venv, uint32_t desc)
2066 {
2067     CPUARMState *env = venv;
2068     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2069              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2070 }
2071 
2072 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2073                             void *venv, uint32_t desc)
2074 {
2075     CPUARMState *env = venv;
2076     do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
2077              get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2078 }
2079 
2080 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2081                                void *venv, uint32_t desc)
2082 {
2083     intptr_t i, oprsz = simd_oprsz(desc);
2084     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2085     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2086     CPUARMState *env = venv;
2087     float_status *status = &env->vfp.fp_status;
2088     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2089 
2090     for (i = 0; i < oprsz; i += sizeof(float32)) {
2091         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2092         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2093         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2094         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2095         float32 aa = *(float32 *)(va + H1_4(i));
2096 
2097         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2098     }
2099 }
2100 
2101 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2102                          uint32_t desc, bool fz16)
2103 {
2104     intptr_t i, oprsz = simd_oprsz(desc);
2105     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2106     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2107     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2108     int is_q = oprsz == 16;
2109     uint64_t n_4;
2110     float32 m_1;
2111 
2112     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2113     n_4 = load4_f16(vn, is_q, is_2);
2114 
2115     /* Negate all inputs for FMLSL at once.  */
2116     if (is_s) {
2117         n_4 ^= 0x8000800080008000ull;
2118     }
2119 
2120     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2121 
2122     for (i = 0; i < oprsz / 4; i++) {
2123         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2124         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2125     }
2126     clear_tail(d, oprsz, simd_maxsz(desc));
2127 }
2128 
2129 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2130                                 void *venv, uint32_t desc)
2131 {
2132     CPUARMState *env = venv;
2133     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2134                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2135 }
2136 
2137 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2138                                 void *venv, uint32_t desc)
2139 {
2140     CPUARMState *env = venv;
2141     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
2142                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
2143 }
2144 
2145 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2146                                void *venv, uint32_t desc)
2147 {
2148     intptr_t i, j, oprsz = simd_oprsz(desc);
2149     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2150     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2151     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2152     CPUARMState *env = venv;
2153     float_status *status = &env->vfp.fp_status;
2154     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
2155 
2156     for (i = 0; i < oprsz; i += 16) {
2157         float16 mm_16 = *(float16 *)(vm + i + idx);
2158         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2159 
2160         for (j = 0; j < 16; j += sizeof(float32)) {
2161             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2162             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2163             float32 aa = *(float32 *)(va + H1_4(i + j));
2164 
2165             *(float32 *)(vd + H1_4(i + j)) =
2166                 float32_muladd(nn, mm, aa, 0, status);
2167         }
2168     }
2169 }
2170 
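/*
 * SSHL/USHL: shift each element left by a per-element signed count
 * taken from the low byte of the corresponding Vm element; negative
 * counts shift right.  Counts whose magnitude reaches the element
 * width yield zero, except for signed right shifts, which fill with
 * the sign bit.
 */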
2171 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2172 {
2173     intptr_t i, opr_sz = simd_oprsz(desc);
2174     int8_t *d = vd, *n = vn, *m = vm;
2175 
2176     for (i = 0; i < opr_sz; ++i) {
2177         int8_t mm = m[i];
2178         int8_t nn = n[i];
2179         int8_t res = 0;
2180         if (mm >= 0) {
2181             if (mm < 8) {
2182                 res = nn << mm;
2183             }
2184         } else {
2185             res = nn >> (mm > -8 ? -mm : 7);
2186         }
2187         d[i] = res;
2188     }
2189     clear_tail(d, opr_sz, simd_maxsz(desc));
2190 }
2191 
2192 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2193 {
2194     intptr_t i, opr_sz = simd_oprsz(desc);
2195     int16_t *d = vd, *n = vn, *m = vm;
2196 
2197     for (i = 0; i < opr_sz / 2; ++i) {
2198         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2199         int16_t nn = n[i];
2200         int16_t res = 0;
2201         if (mm >= 0) {
2202             if (mm < 16) {
2203                 res = nn << mm;
2204             }
2205         } else {
2206             res = nn >> (mm > -16 ? -mm : 15);
2207         }
2208         d[i] = res;
2209     }
2210     clear_tail(d, opr_sz, simd_maxsz(desc));
2211 }
2212 
2213 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2214 {
2215     intptr_t i, opr_sz = simd_oprsz(desc);
2216     uint8_t *d = vd, *n = vn, *m = vm;
2217 
2218     for (i = 0; i < opr_sz; ++i) {
2219         int8_t mm = m[i];
2220         uint8_t nn = n[i];
2221         uint8_t res = 0;
2222         if (mm >= 0) {
2223             if (mm < 8) {
2224                 res = nn << mm;
2225             }
2226         } else {
2227             if (mm > -8) {
2228                 res = nn >> -mm;
2229             }
2230         }
2231         d[i] = res;
2232     }
2233     clear_tail(d, opr_sz, simd_maxsz(desc));
2234 }
2235 
2236 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2237 {
2238     intptr_t i, opr_sz = simd_oprsz(desc);
2239     uint16_t *d = vd, *n = vn, *m = vm;
2240 
2241     for (i = 0; i < opr_sz / 2; ++i) {
2242         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2243         uint16_t nn = n[i];
2244         uint16_t res = 0;
2245         if (mm >= 0) {
2246             if (mm < 16) {
2247                 res = nn << mm;
2248             }
2249         } else {
2250             if (mm > -16) {
2251                 res = nn >> -mm;
2252             }
2253         }
2254         d[i] = res;
2255     }
2256     clear_tail(d, opr_sz, simd_maxsz(desc));
2257 }
2258 
2259 /*
2260  * 8x8->8 polynomial multiply.
2261  *
2262  * Polynomial multiplication is like integer multiplication except the
2263  * partial products are XORed, not added.
2264  *
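 * For example the polynomial product of 0x03 and 0x03 is 0x05, whereas
 * the integer product is 9: the two middle partial products cancel
 * under XOR instead of carrying.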
2265  * TODO: expose this as a generic vector operation, as it is a common
2266  * crypto building block.
2267  */
2268 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2269 {
2270     intptr_t i, opr_sz = simd_oprsz(desc);
2271     uint64_t *d = vd, *n = vn, *m = vm;
2272 
2273     for (i = 0; i < opr_sz / 8; ++i) {
2274         d[i] = clmul_8x8_low(n[i], m[i]);
2275     }
2276     clear_tail(d, opr_sz, simd_maxsz(desc));
2277 }
2278 
2279 /*
2280  * 64x64->128 polynomial multiply.
2281  * Because the lanes are not accessed in strict columns,
2282  * this probably cannot be turned into a generic helper.
2283  */
2284 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2285 {
2286     intptr_t i, opr_sz = simd_oprsz(desc);
2287     intptr_t hi = simd_data(desc);
2288     uint64_t *d = vd, *n = vn, *m = vm;
2289 
2290     for (i = 0; i < opr_sz / 8; i += 2) {
2291         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2292         d[i] = int128_getlo(r);
2293         d[i + 1] = int128_gethi(r);
2294     }
2295     clear_tail(d, opr_sz, simd_maxsz(desc));
2296 }
2297 
2298 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2299 {
2300     int hi = simd_data(desc);
2301     uint64_t *d = vd, *n = vn, *m = vm;
2302     uint64_t nn = n[hi], mm = m[hi];
2303 
2304     d[0] = clmul_8x4_packed(nn, mm);
2305     nn >>= 32;
2306     mm >>= 32;
2307     d[1] = clmul_8x4_packed(nn, mm);
2308 
2309     clear_tail(d, 16, simd_maxsz(desc));
2310 }
2311 
2312 #ifdef TARGET_AARCH64
2313 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2314 {
2315     int shift = simd_data(desc) * 8;
2316     intptr_t i, opr_sz = simd_oprsz(desc);
2317     uint64_t *d = vd, *n = vn, *m = vm;
2318 
2319     for (i = 0; i < opr_sz / 8; ++i) {
2320         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2321     }
2322 }
2323 
2324 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2325 {
2326     intptr_t sel = H4(simd_data(desc));
2327     intptr_t i, opr_sz = simd_oprsz(desc);
2328     uint32_t *n = vn, *m = vm;
2329     uint64_t *d = vd;
2330 
2331     for (i = 0; i < opr_sz / 8; ++i) {
2332         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2333     }
2334 }
2335 #endif
2336 
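/* Compare each element against zero, producing all-ones for true and zero for false. */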
2337 #define DO_CMP0(NAME, TYPE, OP)                         \
2338 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2339 {                                                       \
2340     intptr_t i, opr_sz = simd_oprsz(desc);              \
2341     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2342         TYPE nn = *(TYPE *)(vn + i);                    \
2343         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2344     }                                                   \
2345     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2346 }
2347 
2348 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2349 DO_CMP0(gvec_clt0_b, int8_t, <)
2350 DO_CMP0(gvec_cle0_b, int8_t, <=)
2351 DO_CMP0(gvec_cgt0_b, int8_t, >)
2352 DO_CMP0(gvec_cge0_b, int8_t, >=)
2353 
2354 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2355 DO_CMP0(gvec_clt0_h, int16_t, <)
2356 DO_CMP0(gvec_cle0_h, int16_t, <=)
2357 DO_CMP0(gvec_cgt0_h, int16_t, >)
2358 DO_CMP0(gvec_cge0_h, int16_t, >=)
2359 
2360 #undef DO_CMP0
2361 
2362 #define DO_ABD(NAME, TYPE)                                      \
2363 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2364 {                                                               \
2365     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2366     TYPE *d = vd, *n = vn, *m = vm;                             \
2367                                                                 \
2368     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2369         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2370     }                                                           \
2371     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2372 }
2373 
2374 DO_ABD(gvec_sabd_b, int8_t)
2375 DO_ABD(gvec_sabd_h, int16_t)
2376 DO_ABD(gvec_sabd_s, int32_t)
2377 DO_ABD(gvec_sabd_d, int64_t)
2378 
2379 DO_ABD(gvec_uabd_b, uint8_t)
2380 DO_ABD(gvec_uabd_h, uint16_t)
2381 DO_ABD(gvec_uabd_s, uint32_t)
2382 DO_ABD(gvec_uabd_d, uint64_t)
2383 
2384 #undef DO_ABD
2385 
2386 #define DO_ABA(NAME, TYPE)                                      \
2387 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2388 {                                                               \
2389     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2390     TYPE *d = vd, *n = vn, *m = vm;                             \
2391                                                                 \
2392     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2393         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2394     }                                                           \
2395     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2396 }
2397 
2398 DO_ABA(gvec_saba_b, int8_t)
2399 DO_ABA(gvec_saba_h, int16_t)
2400 DO_ABA(gvec_saba_s, int32_t)
2401 DO_ABA(gvec_saba_d, int64_t)
2402 
2403 DO_ABA(gvec_uaba_b, uint8_t)
2404 DO_ABA(gvec_uaba_h, uint16_t)
2405 DO_ABA(gvec_uaba_s, uint32_t)
2406 DO_ABA(gvec_uaba_d, uint64_t)
2407 
2408 #undef DO_ABA
2409 
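/*
 * Floating-point pairwise operations: the low half of Vd holds the op
 * applied to adjacent pairs of Vn elements, the high half the same for
 * Vm.  A scratch copy is needed when Vd aliases Vm, because Vm is still
 * read after Vd has begun to be written.
 */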
2410 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2411 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
2412 {                                                                          \
2413     ARMVectorReg scratch;                                                  \
2414     intptr_t oprsz = simd_oprsz(desc);                                     \
2415     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2416     TYPE *d = vd, *n = vn, *m = vm;                                        \
2417     if (unlikely(d == m)) {                                                \
2418         m = memcpy(&scratch, m, oprsz);                                    \
2419     }                                                                      \
2420     for (intptr_t i = 0; i < half; ++i) {                                  \
2421         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2422     }                                                                      \
2423     for (intptr_t i = 0; i < half; ++i) {                                  \
2424         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2425     }                                                                      \
2426     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2427 }
2428 
2429 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2430 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2431 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2432 
2433 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2434 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2435 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2436 
2437 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2438 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2439 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2440 
2441 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2442 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2443 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2444 
2445 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2446 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2447 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2448 
2449 #undef DO_3OP_PAIR
2450 
2451 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2452 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2453 {                                                               \
2454     ARMVectorReg scratch;                                       \
2455     intptr_t oprsz = simd_oprsz(desc);                          \
2456     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2457     TYPE *d = vd, *n = vn, *m = vm;                             \
2458     if (unlikely(d == m)) {                                     \
2459         m = memcpy(&scratch, m, oprsz);                         \
2460     }                                                           \
2461     for (intptr_t i = 0; i < half; ++i) {                       \
2462         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2463     }                                                           \
2464     for (intptr_t i = 0; i < half; ++i) {                       \
2465         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2466     }                                                           \
2467     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2468 }
2469 
2470 #define ADD(A, B) (A + B)
2471 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2472 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2473 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2474 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2475 #undef  ADD
2476 
2477 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2478 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2479 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2480 
2481 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2482 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2483 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2484 
2485 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2486 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2487 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2488 
2489 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2490 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2491 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2492 
2493 #undef DO_3OP_PAIR
2494 
2495 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2496     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2497     {                                                                   \
2498         intptr_t i, oprsz = simd_oprsz(desc);                           \
2499         int shift = simd_data(desc);                                    \
2500         TYPE *d = vd, *n = vn;                                          \
2501         float_status *fpst = stat;                                      \
2502         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2503             d[i] = FUNC(n[i], shift, fpst);                             \
2504         }                                                               \
2505         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2506     }
2507 
2508 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2509 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2510 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2511 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2512 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2513 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2514 
2515 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2516 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2517 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2518 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2519 
2520 #undef DO_VCVT_FIXED
2521 
2522 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2523     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2524     {                                                                   \
2525         float_status *fpst = stat;                                      \
2526         intptr_t i, oprsz = simd_oprsz(desc);                           \
2527         uint32_t rmode = simd_data(desc);                               \
2528         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2529         TYPE *d = vd, *n = vn;                                          \
2530         set_float_rounding_mode(rmode, fpst);                           \
2531         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2532             d[i] = FUNC(n[i], 0, fpst);                                 \
2533         }                                                               \
2534         set_float_rounding_mode(prev_rmode, fpst);                      \
2535         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2536     }
2537 
2538 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2539 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2540 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2541 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2542 
2543 #undef DO_VCVT_RMODE
2544 
2545 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2546     void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
2547     {                                                                   \
2548         float_status *fpst = stat;                                      \
2549         intptr_t i, oprsz = simd_oprsz(desc);                           \
2550         uint32_t rmode = simd_data(desc);                               \
2551         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2552         TYPE *d = vd, *n = vn;                                          \
2553         set_float_rounding_mode(rmode, fpst);                           \
2554         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2555             d[i] = FUNC(n[i], fpst);                                    \
2556         }                                                               \
2557         set_float_rounding_mode(prev_rmode, fpst);                      \
2558         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2559     }
2560 
2561 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2562 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2563 
2564 #undef DO_VRINT_RMODE
2565 
2566 #ifdef TARGET_AARCH64
2567 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2568 {
2569     const uint8_t *indices = vm;
2570     CPUARMState *env = venv;
2571     size_t oprsz = simd_oprsz(desc);
2572     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2573     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2574     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2575     union {
2576         uint8_t b[16];
2577         uint64_t d[2];
2578     } result;
2579 
2580     /*
2581      * We must construct the final result in a temp, lest the output
2582      * overlap the input table.  For TBL, begin with zero; for TBX,
2583      * begin with the original register contents.  Note that we always
2584      * copy 16 bytes here to avoid an extra branch; clearing the high
2585      * bits of the register for oprsz == 8 is handled below.
2586      */
2587     if (is_tbx) {
2588         memcpy(&result, vd, 16);
2589     } else {
2590         memset(&result, 0, 16);
2591     }
2592 
2593     for (size_t i = 0; i < oprsz; ++i) {
2594         uint32_t index = indices[H1(i)];
2595 
2596         if (index < table_len) {
2597             /*
2598              * Convert index (a byte offset into the virtual table
2599              * which is a series of 128-bit vectors concatenated)
2600              * into the correct register element, bearing in mind
2601              * that the table can wrap around from V31 to V0.
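             * E.g. with rn == 30 and a three-register table
             * (table_len == 48), index 40 selects byte 8 of V0.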
2602              */
2603             const uint8_t *table = (const uint8_t *)
2604                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2605             result.b[H1(i)] = table[H1(index % 16)];
2606         }
2607     }
2608 
2609     memcpy(vd, &result, 16);
2610     clear_tail(vd, oprsz, simd_maxsz(desc));
2611 }
2612 #endif
2613 
2614 /*
2615  * NxN -> N highpart multiply
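 * (i.e. the upper half of the double-width product of each element
 * pair).  E.g. for bytes, 0xff * 0x02 yields 0xff as SMULH
 * ((-1 * 2) >> 8) but 0x01 as UMULH ((255 * 2) >> 8).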
2616  *
2617  * TODO: expose this as a generic vector operation.
2618  */
2619 
2620 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2621 {
2622     intptr_t i, opr_sz = simd_oprsz(desc);
2623     int8_t *d = vd, *n = vn, *m = vm;
2624 
2625     for (i = 0; i < opr_sz; ++i) {
2626         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2627     }
2628     clear_tail(d, opr_sz, simd_maxsz(desc));
2629 }
2630 
2631 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2632 {
2633     intptr_t i, opr_sz = simd_oprsz(desc);
2634     int16_t *d = vd, *n = vn, *m = vm;
2635 
2636     for (i = 0; i < opr_sz / 2; ++i) {
2637         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2638     }
2639     clear_tail(d, opr_sz, simd_maxsz(desc));
2640 }
2641 
2642 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2643 {
2644     intptr_t i, opr_sz = simd_oprsz(desc);
2645     int32_t *d = vd, *n = vn, *m = vm;
2646 
2647     for (i = 0; i < opr_sz / 4; ++i) {
2648         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2649     }
2650     clear_tail(d, opr_sz, simd_maxsz(desc));
2651 }
2652 
2653 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2654 {
2655     intptr_t i, opr_sz = simd_oprsz(desc);
2656     uint64_t *d = vd, *n = vn, *m = vm;
2657     uint64_t discard;
2658 
2659     for (i = 0; i < opr_sz / 8; ++i) {
2660         muls64(&discard, &d[i], n[i], m[i]);
2661     }
2662     clear_tail(d, opr_sz, simd_maxsz(desc));
2663 }
2664 
2665 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2666 {
2667     intptr_t i, opr_sz = simd_oprsz(desc);
2668     uint8_t *d = vd, *n = vn, *m = vm;
2669 
2670     for (i = 0; i < opr_sz; ++i) {
2671         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2672     }
2673     clear_tail(d, opr_sz, simd_maxsz(desc));
2674 }
2675 
2676 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2677 {
2678     intptr_t i, opr_sz = simd_oprsz(desc);
2679     uint16_t *d = vd, *n = vn, *m = vm;
2680 
2681     for (i = 0; i < opr_sz / 2; ++i) {
2682         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2683     }
2684     clear_tail(d, opr_sz, simd_maxsz(desc));
2685 }
2686 
2687 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2688 {
2689     intptr_t i, opr_sz = simd_oprsz(desc);
2690     uint32_t *d = vd, *n = vn, *m = vm;
2691 
2692     for (i = 0; i < opr_sz / 4; ++i) {
2693         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2694     }
2695     clear_tail(d, opr_sz, simd_maxsz(desc));
2696 }
2697 
2698 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2699 {
2700     intptr_t i, opr_sz = simd_oprsz(desc);
2701     uint64_t *d = vd, *n = vn, *m = vm;
2702     uint64_t discard;
2703 
2704     for (i = 0; i < opr_sz / 8; ++i) {
2705         mulu64(&discard, &d[i], n[i], m[i]);
2706     }
2707     clear_tail(d, opr_sz, simd_maxsz(desc));
2708 }
2709 
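/* XAR: exclusive-OR the two inputs, then rotate each 64-bit lane right by the immediate. */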
2710 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2711 {
2712     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2713     int shr = simd_data(desc);
2714     uint64_t *d = vd, *n = vn, *m = vm;
2715 
2716     for (i = 0; i < opr_sz; ++i) {
2717         d[i] = ror64(n[i] ^ m[i], shr);
2718     }
2719     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2720 }
2721 
2722 /*
2723  * Integer matrix-multiply accumulate
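 *
 * Each 16-byte segment yields a 2x2 block of int32 results: each result
 * accumulates, on top of the corresponding element of Va, the dot
 * product of one 8-byte row of Vn with one 8-byte row of Vm.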
2724  */
2725 
2726 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2727 {
2728     int8_t *n = vn, *m = vm;
2729 
2730     for (intptr_t k = 0; k < 8; ++k) {
2731         sum += n[H1(k)] * m[H1(k)];
2732     }
2733     return sum;
2734 }
2735 
2736 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2737 {
2738     uint8_t *n = vn, *m = vm;
2739 
2740     for (intptr_t k = 0; k < 8; ++k) {
2741         sum += n[H1(k)] * m[H1(k)];
2742     }
2743     return sum;
2744 }
2745 
2746 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2747 {
2748     uint8_t *n = vn;
2749     int8_t *m = vm;
2750 
2751     for (intptr_t k = 0; k < 8; ++k) {
2752         sum += n[H1(k)] * m[H1(k)];
2753     }
2754     return sum;
2755 }
2756 
2757 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2758                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2759 {
2760     intptr_t seg, opr_sz = simd_oprsz(desc);
2761 
2762     for (seg = 0; seg < opr_sz; seg += 16) {
2763         uint32_t *d = vd + seg;
2764         uint32_t *a = va + seg;
2765         uint32_t sum0, sum1, sum2, sum3;
2766 
2767         /*
2768          * Process the entire segment at once, writing back the
2769          * results only after we've consumed all of the inputs.
2770          *
2771          * Key to indices by column:
2772          *          i   j                  i             j
2773          */
2774         sum0 = a[H4(0 + 0)];
2775         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2776         sum1 = a[H4(0 + 1)];
2777         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2778         sum2 = a[H4(2 + 0)];
2779         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2780         sum3 = a[H4(2 + 1)];
2781         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2782 
2783         d[H4(0)] = sum0;
2784         d[H4(1)] = sum1;
2785         d[H4(2)] = sum2;
2786         d[H4(3)] = sum3;
2787     }
2788     clear_tail(vd, opr_sz, simd_maxsz(desc));
2789 }
2790 
2791 #define DO_MMLA_B(NAME, INNER) \
2792     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2793     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2794 
2795 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2796 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2797 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2798 
2799 /*
2800  * BFloat16 Dot Product
2801  */
2802 
2803 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2804 {
2805     /*
2806      * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2807      * For EBF = 0, we ignore the FPCR bits which determine rounding
2808      * mode and denormal-flushing, and we do unfused multiplies and
2809      * additions with intermediate rounding of all products and sums.
2810      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2811      * and we perform a fused two-way sum-of-products without intermediate
2812      * rounding of the products.
2813      * In either case, we don't set fp exception flags.
2814      *
2815      * EBF is AArch64 only, so even if it's set in the FPCR it has
2816      * no effect on AArch32 instructions.
2817      */
2818     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2819 
2820     *statusp = env->vfp.fp_status;
2821     set_default_nan_mode(true, statusp);
2822 
2823     if (ebf) {
2824         /* EBF=1 needs to do a step with round-to-odd semantics */
2825         *oddstatusp = *statusp;
2826         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2827     } else {
2828         set_flush_to_zero(true, statusp);
2829         set_flush_inputs_to_zero(true, statusp);
2830         set_float_rounding_mode(float_round_to_odd_inf, statusp);
2831     }
2832     return ebf;
2833 }
2834 
2835 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2836 {
2837     float32 t1, t2;
2838 
2839     /*
2840      * Extract each BFloat16 from the element pair, and shift
2841      * them such that they become float32.
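     * A bfloat16 value is just the top 16 bits of the equivalent float32,
     * so the shift and mask reinterpret each lane exactly, without any
     * rounding or conversion.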
2842      */
    t1 = float32_mul(e1 << 16, e2 << 16, fpst);
    t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
    t1 = float32_add(t1, t2, fpst);
    t1 = float32_add(sum, t1, fpst);

    return t1;
}

float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
                     float_status *fpst, float_status *fpst_odd)
{
    /*
     * Compare f16_dotadd() in sme_helper.c, but here we have
     * bfloat16 inputs. In particular that means that we do not
     * want the FPCR.FZ16 flush semantics, so we use the normal
     * float_status for the input handling here.
     */
    float64 e1r = float32_to_float64(e1 << 16, fpst);
    float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
    float64 e2r = float32_to_float64(e2 << 16, fpst);
    float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
    float64 t64;
    float32 t32;

    /*
     * The ARM pseudocode function FPDot performs both multiplies
     * and the add with a single rounding operation.  Emulate this
     * by performing the first multiply in round-to-odd, then doing
     * the second multiply as fused multiply-add, and rounding to
     * float32 all in one step.
     */
    t64 = float64_mul(e1r, e2r, fpst_odd);
    t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);

    /* This conversion is exact, because we've already rounded. */
    t32 = float64_to_float32(t64, fpst);

    /* The final accumulation step is not fused. */
    return float32_add(sum, t32, fpst);
}

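/*
 * BFDOT (vector): for each 32-bit element, d = a + the dot product of the
 * corresponding bfloat16 pairs from n and m.
 */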
void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
                        CPUARMState *env, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (i = 0; i < opr_sz / 4; ++i) {
            d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
        }
    } else {
        for (i = 0; i < opr_sz / 4; ++i) {
            d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

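/*
 * BFDOT (by element): as above, but every element within a 128-bit
 * segment uses the single bfloat16 pair of m selected by the index.
 */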
void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
                            void *va, CPUARMState *env, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t index = simd_data(desc);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (i = 0; i < elements; i += eltspersegment) {
            uint32_t m_idx = m[i + H4(index)];

            for (j = i; j < i + eltspersegment; j++) {
                d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
            }
        }
    } else {
        for (i = 0; i < elements; i += eltspersegment) {
            uint32_t m_idx = m[i + H4(index)];

            for (j = i; j < i + eltspersegment; j++) {
                d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
            }
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

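/*
 * BFMMLA: within each 128-bit segment, multiply the 2x4 bfloat16 matrix
 * in n by the transpose of the 2x4 bfloat16 matrix in m, and accumulate
 * into the 2x2 float32 matrix held in a/d.
 */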
void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
                         CPUARMState *env, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (s = 0; s < opr_sz / 4; s += 4) {
            float32 sum00, sum01, sum10, sum11;

            /*
             * Process the entire segment at once, writing back the
             * results only after we've consumed all of the inputs.
             *
             * Key to indices by column:
             *               i   j               i   k             j   k
             */
            sum00 = a[s + H4(0 + 0)];
            sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
            sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);

            sum01 = a[s + H4(0 + 1)];
            sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
            sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);

            sum10 = a[s + H4(2 + 0)];
            sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
            sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);

            sum11 = a[s + H4(2 + 1)];
            sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
            sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);

            d[s + H4(0 + 0)] = sum00;
            d[s + H4(0 + 1)] = sum01;
            d[s + H4(2 + 0)] = sum10;
            d[s + H4(2 + 1)] = sum11;
        }
    } else {
        for (s = 0; s < opr_sz / 4; s += 4) {
            float32 sum00, sum01, sum10, sum11;

            /*
             * Process the entire segment at once, writing back the
             * results only after we've consumed all of the inputs.
             *
             * Key to indices by column:
             *               i   j           i   k             j   k
             */
            sum00 = a[s + H4(0 + 0)];
            sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
            sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);

            sum01 = a[s + H4(0 + 1)];
            sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
            sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);

            sum10 = a[s + H4(2 + 0)];
            sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
            sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);

            sum11 = a[s + H4(2 + 1)];
            sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
            sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);

            d[s + H4(0 + 0)] = sum00;
            d[s + H4(0 + 1)] = sum01;
            d[s + H4(2 + 0)] = sum10;
            d[s + H4(2 + 1)] = sum11;
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

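/*
 * BFMLAL[BT]: widen the even (sel = 0) or odd (sel = 1) bfloat16 element
 * of each pair in n and m to float32, then fused multiply-accumulate
 * into d.
 */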
void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
                         void *stat, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    intptr_t sel = simd_data(desc);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        float32 nn = n[H2(i * 2 + sel)] << 16;
        float32 mm = m[H2(i * 2 + sel)] << 16;
        d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

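/*
 * BFMLAL[BT] (by element): as above, but every element within a 128-bit
 * segment is multiplied by the single bfloat16 of m selected by the index.
 */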
void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
                             void *va, void *stat, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 m_idx = m[H2(2 * i + index)] << 16;

        for (j = i; j < i + eltspersegment; j++) {
            float32 n_j = n[H2(2 * j + sel)] << 16;
            d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

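/*
 * SCLAMP/UCLAMP: clamp each element of a into the range [n, m],
 * i.e. d = MIN(MAX(a, n), m), in signed or unsigned arithmetic.
 */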
#define DO_CLAMP(NAME, TYPE) \
void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
        TYPE aa = *(TYPE *)(a + i);                                     \
        TYPE nn = *(TYPE *)(n + i);                                     \
        TYPE mm = *(TYPE *)(m + i);                                     \
        TYPE dd = MIN(MAX(aa, nn), mm);                                 \
        *(TYPE *)(d + i) = dd;                                          \
    }                                                                   \
    clear_tail(d, opr_sz, simd_maxsz(desc));                            \
}

DO_CLAMP(gvec_sclamp_b, int8_t)
DO_CLAMP(gvec_sclamp_h, int16_t)
DO_CLAMP(gvec_sclamp_s, int32_t)
DO_CLAMP(gvec_sclamp_d, int64_t)

DO_CLAMP(gvec_uclamp_b, uint8_t)
DO_CLAMP(gvec_uclamp_h, uint16_t)
DO_CLAMP(gvec_uclamp_s, uint32_t)
DO_CLAMP(gvec_uclamp_d, uint64_t)

/* Bit count in each 8-bit word. */
void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ctpop8(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Reverse bits in each 8-bit word. */
void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn;

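    /*
     * revbit64 reverses all 64 bits, which also reverses the byte order;
     * the bswap64 pre-swap cancels that, leaving the bits of each byte
     * reversed in place.
     */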
    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = revbit64(bswap64(n[i]));
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}