/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};
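
/*
 * Example: a predicate byte of 0x05 has bits 0 and 2 set, so
 * expand_pred_b_data[0x05] == 0x0000000000ff00ff (bytes 0 and 2 active)
 * and expand_pred_h_data[0x05] == 0x00000000ffffffff (half-words 0 and 1
 * active, as only every other predicate bit is significant for .H).
 */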

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
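
/*
 * Example: do_sqrdmlah_b(INT8_MIN, INT8_MIN, 0, false, true) computes
 * ((-128 * -128) + (1 << 6)) >> 7 == 128, which does not fit in int8_t,
 * so the result saturates to INT8_MAX -- the architectural behaviour
 * for the -1.0 * -1.0 fixed-point overflow case.
 */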

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
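
/*
 * Note: the MIN() clamp on eltspersegment above covers the 64-bit Neon
 * case, where opr_sz == 8 yields only 4 half-word elements, fewer than
 * a full 16-byte segment.
 */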

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}
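
/*
 * Note: the 1 << 62 rounding constant mirrors the (round << (N - 2))
 * term of the narrower helpers above; adding it before the final
 * arithmetic shift right by 63 rounds the doubled product to nearest.
 */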

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
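
/*
 * Example: for gvec_sdot_b, each 32-bit lane accumulates four byte
 * products; with n = {1, 2, 3, 4} and m = {5, 6, 7, 8} the lane gains
 * 1*5 + 2*6 + 3*7 + 4*8 == 70.
 */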

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    /*                                                                    \
     * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
     * first iteration might not be a full 16 byte segment. But           \
     * for vector lengths beyond that this must be SVE and we know        \
     * opr_sz is a multiple of 16, so we need not clamp segend            \
     * to opr_sz_n when we advance it at the end of the loop.             \
     */                                                                   \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        segend = i + (16 / sizeof(TYPED));                                \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
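
/*
 * Example: for gvec_sdot_idx_b with index 1, each 16-byte segment of m
 * contributes only its second four-byte group (loaded as m0..m3), which
 * is then multiplied against every four-byte group of n in that segment.
 */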

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)];
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)];

        if (rot) {
            e3 = float16_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float16_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)];
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)];

        if (rot) {
            e3 = float32_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float32_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1];
        float64 e2 = n[i + 1];
        float64 e3 = m[i];

        if (rot) {
            e3 = float64_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float64_maybe_ah_chs(e1, fpcr_ah);
        }

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float16 negx_imag, negx_real;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 15;
    negx_imag = (negf_imag & ~fpcr_ah) << 15;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ negx_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], negf_real, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
    uint32_t negf_real = flip ^ negf_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    float16 negx_imag, negx_real;
    intptr_t i, j;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 15;
    negx_imag = (negf_imag & ~fpcr_ah) << 15;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = negx_real ^ (flip ? mi : mr);
        float16 e3 = negx_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], negf_real, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], negf_imag, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float32 negx_imag, negx_real;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 31;
    negx_imag = (negf_imag & ~fpcr_ah) << 31;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ negx_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], negf_real, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
    uint32_t negf_real = flip ^ negf_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    float32 negx_imag, negx_real;
    intptr_t i, j;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 31;
    negx_imag = (negf_imag & ~fpcr_ah) << 31;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = negx_real ^ (flip ? mi : mr);
        float32 e3 = negx_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], negf_real, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], negf_imag, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float64 negx_real, negx_imag;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
    negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ negx_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ negx_imag;

        d[i] = float64_muladd(e2, e1, a[i], negf_real, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}
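
/*
 * Example: float32_cge(2.0, 1.0) returns 0xffffffff and
 * float32_cge(1.0, 2.0) returns 0.  A NaN operand gives 0 for all of
 * these, with the le/lt based GE/GT comparisons also raising InvalidOp
 * where the quiet EQ does not.
 */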

static int16_t vfp_tosszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_rpres_s, helper_recpe_rpres_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_rpres_s, helper_rsqrte_rpres_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float64)                \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)   \
    DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)
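
/*
 * clt and cle reuse the cgt/cge comparisons with the operands swapped
 * by the REV wrapper, i.e. they compare zero against the input.
 */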

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
{
    float16 r = float16_sub(op1, op2, stat);
    return float16_is_any_nan(r) ? r : float16_abs(r);
}

static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
{
    float32 r = float32_sub(op1, op2, stat);
    return float32_is_any_nan(r) ? r : float32_abs(r);
}

static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
{
    float64 r = float64_sub(op1, op2, stat);
    return float64_is_any_nan(r) ? r : float64_abs(r);
}

/*
 * Reciprocal step. These are the AArch32 versions, which use a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}
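
/*
 * The (2 - op1*op2) and (3 - op1*op2) / 2 forms above are the
 * correction factors of one Newton-Raphson iteration for the
 * reciprocal and reciprocal square root respectively; callers
 * multiply the current estimate by the returned factor to refine it.
 */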

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
                  float_status *stat, uint32_t desc)                       \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)

DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)

DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)

DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)

#endif
#undef DO_3OP

1521 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1522 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1523                                  float_status *stat)
1524 {
1525     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1526 }
1527 
1528 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1529                                  float_status *stat)
1530 {
1531     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1532 }
1533 
1534 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1535                                  float_status *stat)
1536 {
1537     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1538 }
1539 
1540 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1541                                  float_status *stat)
1542 {
1543     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1544 }
1545 
1546 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1547 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1548                                 float_status *stat)
1549 {
1550     return float16_muladd(op1, op2, dest, 0, stat);
1551 }
1552 
1553 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1554                                 float_status *stat)
1555 {
1556     return float32_muladd(op1, op2, dest, 0, stat);
1557 }
1558 
1559 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1560                                 float_status *stat)
1561 {
1562     return float64_muladd(op1, op2, dest, 0, stat);
1563 }
1564 
1565 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1566                                 float_status *stat)
1567 {
1568     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1569 }
1570 
1571 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1572                                 float_status *stat)
1573 {
1574     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1575 }
1576 
1577 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1578                                 float_status *stat)
1579 {
1580     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1581 }
1582 
1583 static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2,
1584                                    float_status *stat)
1585 {
1586     return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1587 }
1588 
1589 static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2,
1590                                    float_status *stat)
1591 {
1592     return float32_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1593 }
1594 
1595 static float64 float64_ah_mulsub_f(float64 dest, float64 op1, float64 op2,
1596                                    float_status *stat)
1597 {
1598     return float64_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1599 }
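
/*
 * The _ah_ variants implement the FPCR.AH == 1 semantics: the
 * negation is folded into the fused operation with
 * float_muladd_negate_product rather than applied to op1's sign
 * bit up front, which matters when op1 is a NaN (AH semantics do
 * not negate NaN inputs).
 */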
1600 
1601 #define DO_MULADD(NAME, FUNC, TYPE)                                        \
1602 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1603                   float_status *stat, uint32_t desc)                       \
1604 {                                                                          \
1605     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1606     TYPE *d = vd, *n = vn, *m = vm;                                        \
1607     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1608         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1609     }                                                                      \
1610     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1611 }
1612 
1613 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1614 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1615 
1616 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1617 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1618 
1619 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1620 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1621 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1622 
1623 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1624 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1625 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1626 
1627 DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16)
1628 DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32)
1629 DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64)
1630 
1631 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1632  * For AdvSIMD, there is of course only one such vector segment.
1633  */
1634 
1635 #define DO_MUL_IDX(NAME, TYPE, H) \
1636 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1637 {                                                                          \
1638     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1639     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1640     intptr_t idx = simd_data(desc);                                        \
1641     TYPE *d = vd, *n = vn, *m = vm;                                        \
1642     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1643         TYPE mm = m[H(i + idx)];                                           \
1644         for (j = 0; j < segment; j++) {                                    \
1645             d[i + j] = n[i + j] * mm;                                      \
1646         }                                                                  \
1647     }                                                                      \
1648     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1649 }
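
/*
 * E.g. for a 32-byte SVE vector of 32-bit elements, segment == 4,
 * so the indexed element is re-fetched from each 128-bit segment
 * in turn rather than once for the whole vector.
 */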
1650 
1651 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1652 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1653 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1654 
1655 #undef DO_MUL_IDX
1656 
1657 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1658 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1659 {                                                                          \
1660     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1661     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1662     intptr_t idx = simd_data(desc);                                        \
1663     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1664     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1665         TYPE mm = m[H(i + idx)];                                           \
1666         for (j = 0; j < segment; j++) {                                    \
1667             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1668         }                                                                  \
1669     }                                                                      \
1670     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1671 }
1672 
1673 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1674 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1675 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1676 
1677 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1678 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1679 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1680 
1681 #undef DO_MLA_IDX
1682 
1683 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1684 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1685                   float_status *stat, uint32_t desc)                       \
1686 {                                                                          \
1687     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1688     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1689     intptr_t idx = simd_data(desc);                                        \
1690     TYPE *d = vd, *n = vn, *m = vm;                                        \
1691     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1692         TYPE mm = m[H(i + idx)];                                           \
1693         for (j = 0; j < segment; j++) {                                    \
1694             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1695         }                                                                  \
1696     }                                                                      \
1697     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1698 }
1699 
1700 #define nop(N, M, S) (M)
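
/*
 * nop() discards the accumulator (N) and status (S) arguments, so
 * the ADD slot degenerates to a plain multiply for these forms.
 */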
1701 
1702 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1703 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1704 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1705 
1706 #ifdef TARGET_AARCH64
1707 
1708 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1709 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1710 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1711 
1712 #endif
1713 
1714 #undef nop
1715 
1716 /*
1717  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1718  * the fused ops below, these accumulate both from and into Vd.
1719  */
1720 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1721 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1722 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1723 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1724 
1725 #undef DO_FMUL_IDX
1726 
1727 #define DO_FMLA_IDX(NAME, TYPE, H, NEGX, NEGF)                             \
1728 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1729                   float_status *stat, uint32_t desc)                       \
1730 {                                                                          \
1731     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1732     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1733     intptr_t idx = simd_data(desc);                                        \
1734     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1735     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1736         TYPE mm = m[H(i + idx)];                                           \
1737         for (j = 0; j < segment; j++) {                                    \
1738             d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm,                  \
1739                                      a[i + j], NEGF, stat);                \
1740         }                                                                  \
1741     }                                                                      \
1742     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1743 }
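
/*
 * NEGX is a sign-bit mask XORed into the first multiplicand
 * (INTnn_MIN for the AH == 0 FMLS forms, which negate the input);
 * NEGF is passed as the muladd flags argument
 * (float_muladd_negate_product for the AH == 1 forms, which
 * negate the product inside the fused operation instead).
 */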
1744 
1745 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0)
1746 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0)
1747 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0)
1748 
1749 DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0)
1750 DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0)
1751 DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0)
1752 
1753 DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product)
1754 DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product)
1755 DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product)
1756 
1757 #undef DO_FMLA_IDX
1758 
1759 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1760 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1761 {                                                                          \
1762     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1763     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1764     bool q = false;                                                        \
1765     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1766         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1767         if (dd < MIN) {                                                    \
1768             dd = MIN;                                                      \
1769             q = true;                                                      \
1770         } else if (dd > MAX) {                                             \
1771             dd = MAX;                                                      \
1772             q = true;                                                      \
1773         }                                                                  \
1774         d[i] = dd;                                                         \
1775     }                                                                      \
1776     if (q) {                                                               \
1777         uint32_t *qc = vq;                                                 \
1778         qc[0] = 1;                                                         \
1779     }                                                                      \
1780     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1781 }
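
/*
 * The arithmetic is done in WTYPE, wide enough to hold any
 * TYPEN OP TYPEM result without overflow; e.g. gvec_uqadd_b adds
 * two uint8_t values as int, clamps to [0, UINT8_MAX], and sets
 * the sticky QC flag if clamping occurred.
 */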
1782 
1783 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1784 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1785 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1786 
1787 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1788 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1789 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1790 
1791 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1792 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1793 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1794 
1795 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1796 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1797 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1798 
1799 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1800 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1801 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1802 
1803 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1804 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1805 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1806 
1807 #undef DO_SAT
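
/*
 * The 64-bit cases have no wider intermediate type, so they are
 * open-coded below, detecting saturation from carry/borrow
 * (unsigned) or from the signs of operands and result (signed).
 */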
1808 
1809 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1810                           void *vm, uint32_t desc)
1811 {
1812     intptr_t i, oprsz = simd_oprsz(desc);
1813     uint64_t *d = vd, *n = vn, *m = vm;
1814     bool q = false;
1815 
1816     for (i = 0; i < oprsz / 8; i++) {
1817         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1818         if (dd < nn) {
1819             dd = UINT64_MAX;
1820             q = true;
1821         }
1822         d[i] = dd;
1823     }
1824     if (q) {
1825         uint32_t *qc = vq;
1826         qc[0] = 1;
1827     }
1828     clear_tail(d, oprsz, simd_maxsz(desc));
1829 }
1830 
1831 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1832                           void *vm, uint32_t desc)
1833 {
1834     intptr_t i, oprsz = simd_oprsz(desc);
1835     uint64_t *d = vd, *n = vn, *m = vm;
1836     bool q = false;
1837 
1838     for (i = 0; i < oprsz / 8; i++) {
1839         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1840         if (nn < mm) {
1841             dd = 0;
1842             q = true;
1843         }
1844         d[i] = dd;
1845     }
1846     if (q) {
1847         uint32_t *qc = vq;
1848         qc[0] = 1;
1849     }
1850     clear_tail(d, oprsz, simd_maxsz(desc));
1851 }
1852 
1853 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1854                           void *vm, uint32_t desc)
1855 {
1856     intptr_t i, oprsz = simd_oprsz(desc);
1857     int64_t *d = vd, *n = vn, *m = vm;
1858     bool q = false;
1859 
1860     for (i = 0; i < oprsz / 8; i++) {
1861         int64_t nn = n[i], mm = m[i], dd = nn + mm;
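        /*
         * Signed overflow occurred iff the operands have the same
         * sign and the sum's sign differs: (dd ^ nn) has the sign
         * bit set when the result changed sign, ~(nn ^ mm) when
         * the operands agree.  Saturate toward the sign of nn.
         */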
1862         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1863             dd = (nn >> 63) ^ ~INT64_MIN;
1864             q = true;
1865         }
1866         d[i] = dd;
1867     }
1868     if (q) {
1869         uint32_t *qc = vq;
1870         qc[0] = 1;
1871     }
1872     clear_tail(d, oprsz, simd_maxsz(desc));
1873 }
1874 
1875 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1876                           void *vm, uint32_t desc)
1877 {
1878     intptr_t i, oprsz = simd_oprsz(desc);
1879     int64_t *d = vd, *n = vn, *m = vm;
1880     bool q = false;
1881 
1882     for (i = 0; i < oprsz / 8; i++) {
1883         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1884         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1885             dd = (nn >> 63) ^ ~INT64_MIN;
1886             q = true;
1887         }
1888         d[i] = dd;
1889     }
1890     if (q) {
1891         uint32_t *qc = vq;
1892         qc[0] = 1;
1893     }
1894     clear_tail(d, oprsz, simd_maxsz(desc));
1895 }
1896 
1897 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1898                            void *vm, uint32_t desc)
1899 {
1900     intptr_t i, oprsz = simd_oprsz(desc);
1901     uint64_t *d = vd, *n = vn, *m = vm;
1902     bool q = false;
1903 
1904     for (i = 0; i < oprsz / 8; i++) {
1905         uint64_t nn = n[i];
1906         int64_t mm = m[i];
1907         uint64_t dd = nn + mm;
1908 
1909         if (mm < 0) {
1910             if (nn < (uint64_t)-mm) {
1911                 dd = 0;
1912                 q = true;
1913             }
1914         } else {
1915             if (dd < nn) {
1916                 dd = UINT64_MAX;
1917                 q = true;
1918             }
1919         }
1920         d[i] = dd;
1921     }
1922     if (q) {
1923         uint32_t *qc = vq;
1924         qc[0] = 1;
1925     }
1926     clear_tail(d, oprsz, simd_maxsz(desc));
1927 }
1928 
1929 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1930                            void *vm, uint32_t desc)
1931 {
1932     intptr_t i, oprsz = simd_oprsz(desc);
1933     uint64_t *d = vd, *n = vn, *m = vm;
1934     bool q = false;
1935 
1936     for (i = 0; i < oprsz / 8; i++) {
1937         int64_t nn = n[i];
1938         uint64_t mm = m[i];
1939         int64_t dd = nn + mm;
1940 
1941         if (mm > (uint64_t)(INT64_MAX - nn)) {
1942             dd = INT64_MAX;
1943             q = true;
1944         }
1945         d[i] = dd;
1946     }
1947     if (q) {
1948         uint32_t *qc = vq;
1949         qc[0] = 1;
1950     }
1951     clear_tail(d, oprsz, simd_maxsz(desc));
1952 }
1953 
1954 #define DO_SRA(NAME, TYPE)                              \
1955 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1956 {                                                       \
1957     intptr_t i, oprsz = simd_oprsz(desc);               \
1958     int shift = simd_data(desc);                        \
1959     TYPE *d = vd, *n = vn;                              \
1960     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1961         d[i] += n[i] >> shift;                          \
1962     }                                                   \
1963     clear_tail(d, oprsz, simd_maxsz(desc));             \
1964 }
1965 
1966 DO_SRA(gvec_ssra_b, int8_t)
1967 DO_SRA(gvec_ssra_h, int16_t)
1968 DO_SRA(gvec_ssra_s, int32_t)
1969 DO_SRA(gvec_ssra_d, int64_t)
1970 
1971 DO_SRA(gvec_usra_b, uint8_t)
1972 DO_SRA(gvec_usra_h, uint16_t)
1973 DO_SRA(gvec_usra_s, uint32_t)
1974 DO_SRA(gvec_usra_d, uint64_t)
1975 
1976 #undef DO_SRA
1977 
1978 #define DO_RSHR(NAME, TYPE)                             \
1979 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1980 {                                                       \
1981     intptr_t i, oprsz = simd_oprsz(desc);               \
1982     int shift = simd_data(desc);                        \
1983     TYPE *d = vd, *n = vn;                              \
1984     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1985         TYPE tmp = n[i] >> (shift - 1);                 \
1986         d[i] = (tmp >> 1) + (tmp & 1);                  \
1987     }                                                   \
1988     clear_tail(d, oprsz, simd_maxsz(desc));             \
1989 }
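
/*
 * This computes (n + (1 << (shift - 1))) >> shift: the low bit of
 * tmp is the last bit shifted out, and adding it back rounds to
 * nearest, with ties rounded toward plus infinity.  Splitting the
 * shift in two also keeps the shift count legal when shift equals
 * the element width (a single n >> 64 would be undefined for the
 * 64-bit case).
 */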
1990 
1991 DO_RSHR(gvec_srshr_b, int8_t)
1992 DO_RSHR(gvec_srshr_h, int16_t)
1993 DO_RSHR(gvec_srshr_s, int32_t)
1994 DO_RSHR(gvec_srshr_d, int64_t)
1995 
1996 DO_RSHR(gvec_urshr_b, uint8_t)
1997 DO_RSHR(gvec_urshr_h, uint16_t)
1998 DO_RSHR(gvec_urshr_s, uint32_t)
1999 DO_RSHR(gvec_urshr_d, uint64_t)
2000 
2001 #undef DO_RSHR
2002 
2003 #define DO_RSRA(NAME, TYPE)                             \
2004 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2005 {                                                       \
2006     intptr_t i, oprsz = simd_oprsz(desc);               \
2007     int shift = simd_data(desc);                        \
2008     TYPE *d = vd, *n = vn;                              \
2009     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2010         TYPE tmp = n[i] >> (shift - 1);                 \
2011         d[i] += (tmp >> 1) + (tmp & 1);                 \
2012     }                                                   \
2013     clear_tail(d, oprsz, simd_maxsz(desc));             \
2014 }
2015 
2016 DO_RSRA(gvec_srsra_b, int8_t)
2017 DO_RSRA(gvec_srsra_h, int16_t)
2018 DO_RSRA(gvec_srsra_s, int32_t)
2019 DO_RSRA(gvec_srsra_d, int64_t)
2020 
2021 DO_RSRA(gvec_ursra_b, uint8_t)
2022 DO_RSRA(gvec_ursra_h, uint16_t)
2023 DO_RSRA(gvec_ursra_s, uint32_t)
2024 DO_RSRA(gvec_ursra_d, uint64_t)
2025 
2026 #undef DO_RSRA
2027 
2028 #define DO_SRI(NAME, TYPE)                              \
2029 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2030 {                                                       \
2031     intptr_t i, oprsz = simd_oprsz(desc);               \
2032     int shift = simd_data(desc);                        \
2033     TYPE *d = vd, *n = vn;                              \
2034     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2035         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
2036     }                                                   \
2037     clear_tail(d, oprsz, simd_maxsz(desc));             \
2038 }
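
/*
 * SRI preserves the top 'shift' bits of the destination; e.g. for
 * byte elements with shift == 3, d = (d & 0xe0) | (n >> 3).
 */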
2039 
2040 DO_SRI(gvec_sri_b, uint8_t)
2041 DO_SRI(gvec_sri_h, uint16_t)
2042 DO_SRI(gvec_sri_s, uint32_t)
2043 DO_SRI(gvec_sri_d, uint64_t)
2044 
2045 #undef DO_SRI
2046 
2047 #define DO_SLI(NAME, TYPE)                              \
2048 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2049 {                                                       \
2050     intptr_t i, oprsz = simd_oprsz(desc);               \
2051     int shift = simd_data(desc);                        \
2052     TYPE *d = vd, *n = vn;                              \
2053     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2054         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
2055     }                                                   \
2056     clear_tail(d, oprsz, simd_maxsz(desc));             \
2057 }
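
/*
 * SLI preserves the low 'shift' bits of the destination; e.g. for
 * byte elements with shift == 3, d = (d & 0x07) | (n << 3) in
 * 8-bit arithmetic.
 */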
2058 
2059 DO_SLI(gvec_sli_b, uint8_t)
2060 DO_SLI(gvec_sli_h, uint16_t)
2061 DO_SLI(gvec_sli_s, uint32_t)
2062 DO_SLI(gvec_sli_d, uint64_t)
2063 
2064 #undef DO_SLI
2065 
2066 /*
2067  * Convert float16 to float32, raising no exceptions and
2068  * preserving exceptional values, including SNaN.
2069  * This is effectively an unpack+repack operation.
2070  */
2071 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
2072 {
2073     const int f16_bias = 15;
2074     const int f32_bias = 127;
2075     uint32_t sign = extract32(f16, 15, 1);
2076     uint32_t exp = extract32(f16, 10, 5);
2077     uint32_t frac = extract32(f16, 0, 10);
2078 
2079     if (exp == 0x1f) {
2080         /* Inf or NaN */
2081         exp = 0xff;
2082     } else if (exp == 0) {
2083         /* Zero or denormal.  */
2084         if (frac != 0) {
2085             if (fz16) {
2086                 frac = 0;
2087             } else {
2088                 /*
2089                  * Denormal; these are all normal float32.
2090                  * Shift the fraction so that the msb is at bit 11,
2091                  * then remove bit 11 as the implicit bit of the
2092                  * normalized float32.  Note that we still go through
2093                  * the shift for normal numbers below, to put the
2094                  * float32 fraction at the right place.
2095                  */
2096                 int shift = clz32(frac) - 21;
2097                 frac = (frac << shift) & 0x3ff;
2098                 exp = f32_bias - f16_bias - shift + 1;
2099             }
2100         }
2101     } else {
2102         /* Normal number; adjust the bias.  */
2103         exp += f32_bias - f16_bias;
2104     }
2105     sign <<= 31;
2106     exp <<= 23;
2107     frac <<= 23 - 10;
2108 
2109     return sign | exp | frac;
2110 }
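
/*
 * Worked example: f16 1.0 is 0x3c00 (sign 0, exp 15, frac 0); the
 * bias adjustment gives exp 127, so the result is 0x3f800000,
 * f32 1.0.  An f16 SNaN such as 0x7d00 keeps the msb of its
 * fraction clear and therefore remains an SNaN in f32, which is
 * the point of converting by bits.
 */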
2111 
2112 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2113 {
2114     /*
2115      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2116      * Load the 2nd qword iff is_q & is_2.
2117      * Shift to the 2nd dword iff !is_q & is_2.
2118      * For !is_q & !is_2, the upper bits of the result are garbage.
2119      */
2120     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2121 }
2122 
2123 /*
2124  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2125  * as there are not yet SVE versions that might use blocking.
2126  */
2127 
2128 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2129                      uint64_t negx, int negf, uint32_t desc, bool fz16)
2130 {
2131     intptr_t i, oprsz = simd_oprsz(desc);
2132     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2133     int is_q = oprsz == 16;
2134     uint64_t n_4, m_4;
2135 
2136     /*
2137      * Pre-load all of the f16 data, avoiding overlap issues.
2138      * Negate all inputs for AH=0 FMLSL at once.
2139      */
2140     n_4 = load4_f16(vn, is_q, is_2) ^ negx;
2141     m_4 = load4_f16(vm, is_q, is_2);
2142 
2143     for (i = 0; i < oprsz / 4; i++) {
2144         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2145         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2146         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
2147     }
2148     clear_tail(d, oprsz, simd_maxsz(desc));
2149 }
2150 
2151 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2152                             CPUARMState *env, uint32_t desc)
2153 {
2154     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2155     uint64_t negx = is_s ? 0x8000800080008000ull : 0;
2156 
2157     do_fmlal(vd, vn, vm, &env->vfp.fp_status[FPST_STD], negx, 0, desc,
2158              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2159 }
2160 
2161 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2162                             CPUARMState *env, uint32_t desc)
2163 {
2164     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2165     uint64_t negx = 0;
2166     int negf = 0;
2167 
2168     if (is_s) {
2169         if (env->vfp.fpcr & FPCR_AH) {
2170             negf = float_muladd_negate_product;
2171         } else {
2172             negx = 0x8000800080008000ull;
2173         }
2174     }
2175     do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, negx, negf, desc,
2176              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2177 }
2178 
2179 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2180                                CPUARMState *env, uint32_t desc)
2181 {
2182     intptr_t i, oprsz = simd_oprsz(desc);
2183     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2184     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2185     float_status *status = &env->vfp.fp_status_a64;
2186     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2187     int negx = 0, negf = 0;
2188 
2189     if (is_s) {
2190         if (env->vfp.fpcr & FPCR_AH) {
2191             negf = float_muladd_negate_product;
2192         } else {
2193             negx = 0x8000;
2194         }
2195     }
2196 
2197     for (i = 0; i < oprsz; i += sizeof(float32)) {
2198         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negx;
2199         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2200         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2201         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2202         float32 aa = *(float32 *)(va + H1_4(i));
2203 
2204         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, negf, status);
2205     }
2206 }
2207 
2208 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2209                          uint64_t negx, int negf, uint32_t desc, bool fz16)
2210 {
2211     intptr_t i, oprsz = simd_oprsz(desc);
2212     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2213     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2214     int is_q = oprsz == 16;
2215     uint64_t n_4;
2216     float32 m_1;
2217 
2218     /*
2219      * Pre-load all of the f16 data, avoiding overlap issues.
2220      * Negate all inputs for AH=0 FMLSL at once.
2221      */
2222     n_4 = load4_f16(vn, is_q, is_2) ^ negx;
2223     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2224 
2225     for (i = 0; i < oprsz / 4; i++) {
2226         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2227         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
2228     }
2229     clear_tail(d, oprsz, simd_maxsz(desc));
2230 }
2231 
2232 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2233                                 CPUARMState *env, uint32_t desc)
2234 {
2235     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2236     uint64_t negx = is_s ? 0x8000800080008000ull : 0;
2237 
2238     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status[FPST_STD], negx, 0, desc,
2239                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2240 }
2241 
2242 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2243                                 CPUARMState *env, uint32_t desc)
2244 {
2245     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2246     uint64_t negx = 0;
2247     int negf = 0;
2248 
2249     if (is_s) {
2250         if (env->vfp.fpcr & FPCR_AH) {
2251             negf = float_muladd_negate_product;
2252         } else {
2253             negx = 0x8000800080008000ull;
2254         }
2255     }
2256     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, negx, negf, desc,
2257                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2258 }
2259 
2260 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2261                                CPUARMState *env, uint32_t desc)
2262 {
2263     intptr_t i, j, oprsz = simd_oprsz(desc);
2264     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2265     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2266     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2267     float_status *status = &env->vfp.fp_status_a64;
2268     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2269     int negx = 0, negf = 0;
2270 
2271     if (is_s) {
2272         if (env->vfp.fpcr & FPCR_AH) {
2273             negf = float_muladd_negate_product;
2274         } else {
2275             negx = 0x8000;
2276         }
2277     }
2278 
2279     for (i = 0; i < oprsz; i += 16) {
2280         float16 mm_16 = *(float16 *)(vm + i + idx);
2281         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2282 
2283         for (j = 0; j < 16; j += sizeof(float32)) {
2284             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negx;
2285             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2286             float32 aa = *(float32 *)(va + H1_4(i + j));
2287 
2288             *(float32 *)(vd + H1_4(i + j)) =
2289                 float32_muladd(nn, mm, aa, negf, status);
2290         }
2291     }
2292 }
2293 
2294 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2295 {
2296     intptr_t i, opr_sz = simd_oprsz(desc);
2297     int8_t *d = vd, *n = vn, *m = vm;
2298 
2299     for (i = 0; i < opr_sz; ++i) {
2300         int8_t mm = m[i];
2301         int8_t nn = n[i];
2302         int8_t res = 0;
2303         if (mm >= 0) {
2304             if (mm < 8) {
2305                 res = nn << mm;
2306             }
2307         } else {
2308             res = nn >> (mm > -8 ? -mm : 7);
2309         }
2310         d[i] = res;
2311     }
2312     clear_tail(d, opr_sz, simd_maxsz(desc));
2313 }
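
/*
 * Neon shift-by-register: the count is the signed low byte of m,
 * and negative counts shift right.  Left shifts of 8 or more give
 * zero; signed right shifts of 8 or more are clamped to 7, which
 * produces the sign-fill result the architecture specifies.
 */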
2314 
2315 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2316 {
2317     intptr_t i, opr_sz = simd_oprsz(desc);
2318     int16_t *d = vd, *n = vn, *m = vm;
2319 
2320     for (i = 0; i < opr_sz / 2; ++i) {
2321         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2322         int16_t nn = n[i];
2323         int16_t res = 0;
2324         if (mm >= 0) {
2325             if (mm < 16) {
2326                 res = nn << mm;
2327             }
2328         } else {
2329             res = nn >> (mm > -16 ? -mm : 15);
2330         }
2331         d[i] = res;
2332     }
2333     clear_tail(d, opr_sz, simd_maxsz(desc));
2334 }
2335 
2336 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2337 {
2338     intptr_t i, opr_sz = simd_oprsz(desc);
2339     uint8_t *d = vd, *n = vn, *m = vm;
2340 
2341     for (i = 0; i < opr_sz; ++i) {
2342         int8_t mm = m[i];
2343         uint8_t nn = n[i];
2344         uint8_t res = 0;
2345         if (mm >= 0) {
2346             if (mm < 8) {
2347                 res = nn << mm;
2348             }
2349         } else {
2350             if (mm > -8) {
2351                 res = nn >> -mm;
2352             }
2353         }
2354         d[i] = res;
2355     }
2356     clear_tail(d, opr_sz, simd_maxsz(desc));
2357 }
2358 
2359 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2360 {
2361     intptr_t i, opr_sz = simd_oprsz(desc);
2362     uint16_t *d = vd, *n = vn, *m = vm;
2363 
2364     for (i = 0; i < opr_sz / 2; ++i) {
2365         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2366         uint16_t nn = n[i];
2367         uint16_t res = 0;
2368         if (mm >= 0) {
2369             if (mm < 16) {
2370                 res = nn << mm;
2371             }
2372         } else {
2373             if (mm > -16) {
2374                 res = nn >> -mm;
2375             }
2376         }
2377         d[i] = res;
2378     }
2379     clear_tail(d, opr_sz, simd_maxsz(desc));
2380 }
2381 
2382 /*
2383  * 8x8->8 polynomial multiply.
2384  *
2385  * Polynomial multiplication is like integer multiplication except the
2386  * partial products are XORed, not added.
2387  *
2388  * TODO: expose this as a generic vector operation, as it is a common
2389  * crypto building block.
2390  */
2391 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2392 {
2393     intptr_t i, opr_sz = simd_oprsz(desc);
2394     uint64_t *d = vd, *n = vn, *m = vm;
2395 
2396     for (i = 0; i < opr_sz / 8; ++i) {
2397         d[i] = clmul_8x8_low(n[i], m[i]);
2398     }
2399     clear_tail(d, opr_sz, simd_maxsz(desc));
2400 }
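
/*
 * E.g. 0x03 * 0x03 = 0x05 here: (x + 1)^2 = x^2 + 1 over GF(2),
 * as the two x partial products cancel under XOR.
 */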
2401 
2402 /*
2403  * 64x64->128 polynomial multiply.
2404  * Because the lanes are not accessed in strict columns,
2405  * this probably cannot be turned into a generic helper.
2406  */
2407 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2408 {
2409     intptr_t i, opr_sz = simd_oprsz(desc);
2410     intptr_t hi = simd_data(desc);
2411     uint64_t *d = vd, *n = vn, *m = vm;
2412 
2413     for (i = 0; i < opr_sz / 8; i += 2) {
2414         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2415         d[i] = int128_getlo(r);
2416         d[i + 1] = int128_gethi(r);
2417     }
2418     clear_tail(d, opr_sz, simd_maxsz(desc));
2419 }
2420 
2421 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2422 {
2423     int hi = simd_data(desc);
2424     uint64_t *d = vd, *n = vn, *m = vm;
2425     uint64_t nn = n[hi], mm = m[hi];
2426 
2427     d[0] = clmul_8x4_packed(nn, mm);
2428     nn >>= 32;
2429     mm >>= 32;
2430     d[1] = clmul_8x4_packed(nn, mm);
2431 
2432     clear_tail(d, 16, simd_maxsz(desc));
2433 }
2434 
2435 #ifdef TARGET_AARCH64
2436 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2437 {
2438     int shift = simd_data(desc) * 8;
2439     intptr_t i, opr_sz = simd_oprsz(desc);
2440     uint64_t *d = vd, *n = vn, *m = vm;
2441 
2442     for (i = 0; i < opr_sz / 8; ++i) {
2443         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2444     }
2445 }
2446 
2447 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2448 {
2449     intptr_t sel = H4(simd_data(desc));
2450     intptr_t i, opr_sz = simd_oprsz(desc);
2451     uint32_t *n = vn, *m = vm;
2452     uint64_t *d = vd;
2453 
2454     for (i = 0; i < opr_sz / 8; ++i) {
2455         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2456     }
2457 }
2458 #endif
2459 
2460 #define DO_CMP0(NAME, TYPE, OP)                         \
2461 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2462 {                                                       \
2463     intptr_t i, opr_sz = simd_oprsz(desc);              \
2464     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2465         TYPE nn = *(TYPE *)(vn + i);                    \
2466         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2467     }                                                   \
2468     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2469 }
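
/*
 * The C comparison yields 0 or 1; negating it gives the all-zeros
 * or all-ones element mask that the vector compares produce.
 */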
2470 
2471 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2472 DO_CMP0(gvec_clt0_b, int8_t, <)
2473 DO_CMP0(gvec_cle0_b, int8_t, <=)
2474 DO_CMP0(gvec_cgt0_b, int8_t, >)
2475 DO_CMP0(gvec_cge0_b, int8_t, >=)
2476 
2477 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2478 DO_CMP0(gvec_clt0_h, int16_t, <)
2479 DO_CMP0(gvec_cle0_h, int16_t, <=)
2480 DO_CMP0(gvec_cgt0_h, int16_t, >)
2481 DO_CMP0(gvec_cge0_h, int16_t, >=)
2482 
2483 #undef DO_CMP0
2484 
2485 #define DO_ABD(NAME, TYPE)                                      \
2486 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2487 {                                                               \
2488     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2489     TYPE *d = vd, *n = vn, *m = vm;                             \
2490                                                                 \
2491     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2492         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2493     }                                                           \
2494     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2495 }
2496 
2497 DO_ABD(gvec_sabd_b, int8_t)
2498 DO_ABD(gvec_sabd_h, int16_t)
2499 DO_ABD(gvec_sabd_s, int32_t)
2500 DO_ABD(gvec_sabd_d, int64_t)
2501 
2502 DO_ABD(gvec_uabd_b, uint8_t)
2503 DO_ABD(gvec_uabd_h, uint16_t)
2504 DO_ABD(gvec_uabd_s, uint32_t)
2505 DO_ABD(gvec_uabd_d, uint64_t)
2506 
2507 #undef DO_ABD
2508 
2509 #define DO_ABA(NAME, TYPE)                                      \
2510 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2511 {                                                               \
2512     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2513     TYPE *d = vd, *n = vn, *m = vm;                             \
2514                                                                 \
2515     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2516         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2517     }                                                           \
2518     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2519 }
2520 
2521 DO_ABA(gvec_saba_b, int8_t)
2522 DO_ABA(gvec_saba_h, int16_t)
2523 DO_ABA(gvec_saba_s, int32_t)
2524 DO_ABA(gvec_saba_d, int64_t)
2525 
2526 DO_ABA(gvec_uaba_b, uint8_t)
2527 DO_ABA(gvec_uaba_h, uint16_t)
2528 DO_ABA(gvec_uaba_s, uint32_t)
2529 DO_ABA(gvec_uaba_d, uint64_t)
2530 
2531 #undef DO_ABA
2532 
2533 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2534 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
2535                   float_status *stat, uint32_t desc)                       \
2536 {                                                                          \
2537     ARMVectorReg scratch;                                                  \
2538     intptr_t oprsz = simd_oprsz(desc);                                     \
2539     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2540     TYPE *d = vd, *n = vn, *m = vm;                                        \
2541     if (unlikely(d == m)) {                                                \
2542         m = memcpy(&scratch, m, oprsz);                                    \
2543     }                                                                      \
2544     for (intptr_t i = 0; i < half; ++i) {                                  \
2545         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2546     }                                                                      \
2547     for (intptr_t i = 0; i < half; ++i) {                                  \
2548         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2549     }                                                                      \
2550     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2551 }
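
/*
 * Only d == m needs the scratch copy: in the first loop each write
 * to d[i] trails the n elements still to be read (indices 2i and
 * 2i + 1 never precede i), but m is not read until the second
 * loop, after the low half of d has already been overwritten.
 */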
2552 
2553 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2554 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2555 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2556 
2557 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2558 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2559 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2560 
2561 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2562 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2563 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2564 
2565 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2566 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2567 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2568 
2569 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2570 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2571 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2572 
2573 #ifdef TARGET_AARCH64
2574 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
2575 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
2576 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, )
2577 
2578 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
2579 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
2580 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, )
2581 #endif
2582 
2583 #undef DO_3OP_PAIR
2584 
2585 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2586 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2587 {                                                               \
2588     ARMVectorReg scratch;                                       \
2589     intptr_t oprsz = simd_oprsz(desc);                          \
2590     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2591     TYPE *d = vd, *n = vn, *m = vm;                             \
2592     if (unlikely(d == m)) {                                     \
2593         m = memcpy(&scratch, m, oprsz);                         \
2594     }                                                           \
2595     for (intptr_t i = 0; i < half; ++i) {                       \
2596         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2597     }                                                           \
2598     for (intptr_t i = 0; i < half; ++i) {                       \
2599         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2600     }                                                           \
2601     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2602 }
2603 
2604 #define ADD(A, B) (A + B)
2605 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2606 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2607 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2608 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2609 #undef  ADD
2610 
2611 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2612 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2613 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2614 
2615 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2616 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2617 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2618 
2619 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2620 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2621 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2622 
2623 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2624 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2625 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2626 
2627 #undef DO_3OP_PAIR
2628 
2629 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2630     void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
2631     {                                                                   \
2632         intptr_t i, oprsz = simd_oprsz(desc);                           \
2633         int shift = simd_data(desc);                                    \
2634         TYPE *d = vd, *n = vn;                                          \
2635         float_status *fpst = stat;                                      \
2636         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2637             d[i] = FUNC(n[i], shift, fpst);                             \
2638         }                                                               \
2639         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2640     }
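
/*
 * simd_data() holds the number of fixed-point fraction bits: the
 * first group below converts fixed-point to float, and the _rz_
 * group converts float to fixed-point with round-to-zero.
 */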
2641 
2642 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2643 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2644 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2645 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2646 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2647 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2648 
2649 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2650 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2651 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2652 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2653 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2654 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2655 
2656 #undef DO_VCVT_FIXED
2657 
2658 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2659     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2660     {                                                                   \
2661         intptr_t i, oprsz = simd_oprsz(desc);                           \
2662         uint32_t rmode = simd_data(desc);                               \
2663         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2664         TYPE *d = vd, *n = vn;                                          \
2665         set_float_rounding_mode(rmode, fpst);                           \
2666         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2667             d[i] = FUNC(n[i], 0, fpst);                                 \
2668         }                                                               \
2669         set_float_rounding_mode(prev_rmode, fpst);                      \
2670         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2671     }
2672 
2673 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
2674 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
2675 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2676 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2677 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2678 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2679 
2680 #undef DO_VCVT_RMODE
2681 
2682 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2683     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2684     {                                                                   \
2685         intptr_t i, oprsz = simd_oprsz(desc);                           \
2686         uint32_t rmode = simd_data(desc);                               \
2687         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2688         TYPE *d = vd, *n = vn;                                          \
2689         set_float_rounding_mode(rmode, fpst);                           \
2690         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2691             d[i] = FUNC(n[i], fpst);                                    \
2692         }                                                               \
2693         set_float_rounding_mode(prev_rmode, fpst);                      \
2694         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2695     }
2696 
2697 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2698 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2699 
2700 #undef DO_VRINT_RMODE
2701 
2702 #ifdef TARGET_AARCH64
2703 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
2704 {
2705     const uint8_t *indices = vm;
2706     size_t oprsz = simd_oprsz(desc);
2707     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2708     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2709     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2710     union {
2711         uint8_t b[16];
2712         uint64_t d[2];
2713     } result;
2714 
2715     /*
2716      * We must construct the final result in a temp, lest the output
2717      * overlaps the input table.  For TBL, begin with zero; for TBX,
2718      * begin with the original register contents.  Note that we always
2719      * copy 16 bytes here to avoid an extra branch; clearing the high
2720      * bits of the register for oprsz == 8 is handled below.
2721      */
2722     if (is_tbx) {
2723         memcpy(&result, vd, 16);
2724     } else {
2725         memset(&result, 0, 16);
2726     }
2727 
2728     for (size_t i = 0; i < oprsz; ++i) {
2729         uint32_t index = indices[H1(i)];
2730 
2731         if (index < table_len) {
2732             /*
2733              * Convert index (a byte offset into the virtual table
2734              * which is a series of 128-bit vectors concatenated)
2735              * into the correct register element, bearing in mind
2736              * that the table can wrap around from V31 to V0.
2737              */
2738             const uint8_t *table = (const uint8_t *)
2739                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2740             result.b[H1(i)] = table[H1(index % 16)];
2741         }
2742     }
2743 
2744     memcpy(vd, &result, 16);
2745     clear_tail(vd, oprsz, simd_maxsz(desc));
2746 }
2747 #endif
2748 
2749 /*
2750  * NxN -> N highpart multiply
2751  *
2752  * TODO: expose this as a generic vector operation.
2753  */
2754 
2755 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2756 {
2757     intptr_t i, opr_sz = simd_oprsz(desc);
2758     int8_t *d = vd, *n = vn, *m = vm;
2759 
2760     for (i = 0; i < opr_sz; ++i) {
2761         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2762     }
2763     clear_tail(d, opr_sz, simd_maxsz(desc));
2764 }
2765 
2766 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2767 {
2768     intptr_t i, opr_sz = simd_oprsz(desc);
2769     int16_t *d = vd, *n = vn, *m = vm;
2770 
2771     for (i = 0; i < opr_sz / 2; ++i) {
2772         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2773     }
2774     clear_tail(d, opr_sz, simd_maxsz(desc));
2775 }
2776 
2777 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2778 {
2779     intptr_t i, opr_sz = simd_oprsz(desc);
2780     int32_t *d = vd, *n = vn, *m = vm;
2781 
2782     for (i = 0; i < opr_sz / 4; ++i) {
2783         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2784     }
2785     clear_tail(d, opr_sz, simd_maxsz(desc));
2786 }
2787 
2788 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2789 {
2790     intptr_t i, opr_sz = simd_oprsz(desc);
2791     uint64_t *d = vd, *n = vn, *m = vm;
2792     uint64_t discard;
2793 
2794     for (i = 0; i < opr_sz / 8; ++i) {
2795         muls64(&discard, &d[i], n[i], m[i]);
2796     }
2797     clear_tail(d, opr_sz, simd_maxsz(desc));
2798 }
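
/*
 * No wider type exists for the 64-bit case, so muls64/mulu64
 * produce the full 128-bit product and the low half is discarded.
 */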
2799 
2800 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2801 {
2802     intptr_t i, opr_sz = simd_oprsz(desc);
2803     uint8_t *d = vd, *n = vn, *m = vm;
2804 
2805     for (i = 0; i < opr_sz; ++i) {
2806         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2807     }
2808     clear_tail(d, opr_sz, simd_maxsz(desc));
2809 }
2810 
2811 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2812 {
2813     intptr_t i, opr_sz = simd_oprsz(desc);
2814     uint16_t *d = vd, *n = vn, *m = vm;
2815 
2816     for (i = 0; i < opr_sz / 2; ++i) {
2817         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2818     }
2819     clear_tail(d, opr_sz, simd_maxsz(desc));
2820 }
2821 
2822 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2823 {
2824     intptr_t i, opr_sz = simd_oprsz(desc);
2825     uint32_t *d = vd, *n = vn, *m = vm;
2826 
2827     for (i = 0; i < opr_sz / 4; ++i) {
2828         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2829     }
2830     clear_tail(d, opr_sz, simd_maxsz(desc));
2831 }
2832 
2833 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2834 {
2835     intptr_t i, opr_sz = simd_oprsz(desc);
2836     uint64_t *d = vd, *n = vn, *m = vm;
2837     uint64_t discard;
2838 
2839     for (i = 0; i < opr_sz / 8; ++i) {
2840         mulu64(&discard, &d[i], n[i], m[i]);
2841     }
2842     clear_tail(d, opr_sz, simd_maxsz(desc));
2843 }
2844 
2845 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2846 {
2847     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2848     int shr = simd_data(desc);
2849     uint64_t *d = vd, *n = vn, *m = vm;
2850 
2851     for (i = 0; i < opr_sz; ++i) {
2852         d[i] = ror64(n[i] ^ m[i], shr);
2853     }
2854     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2855 }
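
/*
 * XAR is exclusive-or followed by a rotate right.  For example,
 * with n = 0x00000000000000ff, m = 0x0000000000000f0f, shr = 4:
 * n ^ m = 0x0000000000000ff0, ror64(0xff0, 4) = 0x00000000000000ff.
 */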
2856 
2857 /*
2858  * Integer matrix-multiply accumulate
2859  */
2860 
2861 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2862 {
2863     int8_t *n = vn, *m = vm;
2864 
2865     for (intptr_t k = 0; k < 8; ++k) {
2866         sum += n[H1(k)] * m[H1(k)];
2867     }
2868     return sum;
2869 }
2870 
2871 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2872 {
2873     uint8_t *n = vn, *m = vm;
2874 
2875     for (intptr_t k = 0; k < 8; ++k) {
2876         sum += n[H1(k)] * m[H1(k)];
2877     }
2878     return sum;
2879 }
2880 
2881 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2882 {
2883     uint8_t *n = vn;
2884     int8_t *m = vm;
2885 
2886     for (intptr_t k = 0; k < 8; ++k) {
2887         sum += n[H1(k)] * m[H1(k)];
2888     }
2889     return sum;
2890 }
2891 
2892 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2893                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2894 {
2895     intptr_t seg, opr_sz = simd_oprsz(desc);
2896 
2897     for (seg = 0; seg < opr_sz; seg += 16) {
2898         uint32_t *d = vd + seg;
2899         uint32_t *a = va + seg;
2900         uint32_t sum0, sum1, sum2, sum3;
2901 
2902         /*
2903          * Process the entire segment at once, writing back the
2904          * results only after we've consumed all of the inputs.
2905          *
2906          * Key to indices by column:
2907          *          i   j                  i             j
2908          */
2909         sum0 = a[H4(0 + 0)];
2910         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2911         sum1 = a[H4(0 + 1)];
2912         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2913         sum2 = a[H4(2 + 0)];
2914         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2915         sum3 = a[H4(2 + 1)];
2916         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2917 
2918         d[H4(0)] = sum0;
2919         d[H4(1)] = sum1;
2920         d[H4(2)] = sum2;
2921         d[H4(3)] = sum3;
2922     }
2923     clear_tail(vd, opr_sz, simd_maxsz(desc));
2924 }
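
/*
 * Layout sketch for one 16-byte segment: vn holds a 2x8 matrix by
 * rows (row 0 in bytes 0..7, row 1 in bytes 8..15), vm holds the
 * second matrix by columns (column 0 in bytes 0..7, column 1 in
 * bytes 8..15), and d/a form the 2x2 int32 result, so that
 *
 *     d[i][j] = a[i][j] + dot8(n_row[i], m_col[j])
 *
 * where dot8 is shorthand for the 8-element inner_loop dot product.
 * The four inner_loop calls above are exactly these four dot
 * products, with byte offsets 0 and 8 selecting row and column.
 */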
2925 
2926 #define DO_MMLA_B(NAME, INNER) \
2927     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2928     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2929 
2930 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2931 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2932 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2933 
2934 /*
2935  * BFloat16 Dot Product
2936  */
2937 
2938 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2939 {
2940     /*
2941      * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2942      * For EBF = 0, we ignore the FPCR bits which determine rounding
2943      * mode and denormal-flushing, and we do unfused multiplies and
2944      * additions with intermediate rounding of all products and sums.
2945      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2946      * and we perform a fused two-way sum-of-products without intermediate
2947      * rounding of the products.
2948      * In either case, we don't set fp exception flags.
2949      *
2950      * EBF is AArch64 only, so even if it's set in the FPCR it has
2951      * no effect on AArch32 instructions.
2952      */
2953     bool ebf = is_a64(env) && (env->vfp.fpcr & FPCR_EBF);
2954 
2955     *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32;
2956     set_default_nan_mode(true, statusp);
2957 
2958     if (ebf) {
2959         /* EBF=1 needs to do a step with round-to-odd semantics */
2960         *oddstatusp = *statusp;
2961         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2962     } else {
2963         set_flush_to_zero(true, statusp);
2964         set_flush_inputs_to_zero(true, statusp);
2965         set_float_rounding_mode(float_round_to_odd_inf, statusp);
2966     }
2967     return ebf;
2968 }
2969 
2970 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2971 {
2972     float32 t1, t2;
2973 
2974     /*
2975      * Extract each BFloat16 from the element pair, and shift
2976      * them such that they become float32.
2977      */
2978     t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2979     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2980     t1 = float32_add(t1, t2, fpst);
2981     t1 = float32_add(sum, t1, fpst);
2982 
2983     return t1;
2984 }
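
/*
 * The shifts and masks work because bfloat16 is exactly the top 16
 * bits of a float32: e.g. bfloat16 0x3f80 (1.0) shifted left 16 is
 * 0x3f800000, which is float32 1.0.  Each 32-bit element pair thus
 * contributes one product from its low half (e << 16) and one from
 * its high half (e & 0xffff0000), summed with intermediate rounding
 * per the EBF = 0 semantics described in is_ebf().
 */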
2985 
2986 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
2987                      float_status *fpst, float_status *fpst_odd)
2988 {
2989     /*
2990      * Compare f16_dotadd() in sme_helper.c, but here we have
2991      * bfloat16 inputs. In particular that means that we do not
2992      * want the FPCR.FZ16 flush semantics, so we use the normal
2993      * float_status for the input handling here.
2994      */
2995     float64 e1r = float32_to_float64(e1 << 16, fpst);
2996     float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
2997     float64 e2r = float32_to_float64(e2 << 16, fpst);
2998     float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
2999     float64 t64;
3000     float32 t32;
3001 
3002     /*
3003      * The ARM pseudocode function FPDot performs both multiplies
3004      * and the add with a single rounding operation.  Emulate this
3005      * by performing the first multiply in round-to-odd, then doing
3006      * the second multiply as fused multiply-add, and rounding to
3007      * float32 all in one step.
3008      */
3009     t64 = float64_mul(e1r, e2r, fpst_odd);
3010     t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
3011 
3012     /* This conversion is exact, because we've already rounded. */
3013     t32 = float64_to_float32(t64, fpst);
3014 
3015     /* The final accumulation step is not fused. */
3016     return float32_add(sum, t32, fpst);
3017 }
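
/*
 * Round-to-odd sets the low bit of the result whenever any discarded
 * bits are nonzero, so no information needed by a later rounding is
 * lost.  Since float64 carries more than twice the precision of
 * float32 (53 vs 24 bits), the odd-rounded intermediate product
 * plus the fused muladd reproduce the single-rounding FPDot result.
 */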
3018 
3019 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
3020                         CPUARMState *env, uint32_t desc)
3021 {
3022     intptr_t i, opr_sz = simd_oprsz(desc);
3023     float32 *d = vd, *a = va;
3024     uint32_t *n = vn, *m = vm;
3025     float_status fpst, fpst_odd;
3026 
3027     if (is_ebf(env, &fpst, &fpst_odd)) {
3028         for (i = 0; i < opr_sz / 4; ++i) {
3029             d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
3030         }
3031     } else {
3032         for (i = 0; i < opr_sz / 4; ++i) {
3033             d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
3034         }
3035     }
3036     clear_tail(d, opr_sz, simd_maxsz(desc));
3037 }
3038 
3039 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
3040                             void *va, CPUARMState *env, uint32_t desc)
3041 {
3042     intptr_t i, j, opr_sz = simd_oprsz(desc);
3043     intptr_t index = simd_data(desc);
3044     intptr_t elements = opr_sz / 4;
3045     intptr_t eltspersegment = MIN(16 / 4, elements);
3046     float32 *d = vd, *a = va;
3047     uint32_t *n = vn, *m = vm;
3048     float_status fpst, fpst_odd;
3049 
3050     if (is_ebf(env, &fpst, &fpst_odd)) {
3051         for (i = 0; i < elements; i += eltspersegment) {
3052             uint32_t m_idx = m[i + H4(index)];
3053 
3054             for (j = i; j < i + eltspersegment; j++) {
3055                 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
3056             }
3057         }
3058     } else {
3059         for (i = 0; i < elements; i += eltspersegment) {
3060             uint32_t m_idx = m[i + H4(index)];
3061 
3062             for (j = i; j < i + eltspersegment; j++) {
3063                 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
3064             }
3065         }
3066     }
3067     clear_tail(d, opr_sz, simd_maxsz(desc));
3068 }
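
/*
 * Indexed-form example: a 128-bit segment holds four 32-bit element
 * pairs, so eltspersegment is 4 (fewer only for an 8-byte vector).
 * With index = 2, every element of a segment is multiplied by the
 * same pair m[base + 2] from that segment of the m operand.
 */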
3069 
3070 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
3071                          CPUARMState *env, uint32_t desc)
3072 {
3073     intptr_t s, opr_sz = simd_oprsz(desc);
3074     float32 *d = vd, *a = va;
3075     uint32_t *n = vn, *m = vm;
3076     float_status fpst, fpst_odd;
3077 
3078     if (is_ebf(env, &fpst, &fpst_odd)) {
3079         for (s = 0; s < opr_sz / 4; s += 4) {
3080             float32 sum00, sum01, sum10, sum11;
3081 
3082             /*
3083              * Process the entire segment at once, writing back the
3084              * results only after we've consumed all of the inputs.
3085              *
3086              * Key to indices by column:
3087              *               i   j               i   k             j   k
3088              */
3089             sum00 = a[s + H4(0 + 0)];
3090             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3091             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3092 
3093             sum01 = a[s + H4(0 + 1)];
3094             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3095             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3096 
3097             sum10 = a[s + H4(2 + 0)];
3098             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3099             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3100 
3101             sum11 = a[s + H4(2 + 1)];
3102             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3103             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3104 
3105             d[s + H4(0 + 0)] = sum00;
3106             d[s + H4(0 + 1)] = sum01;
3107             d[s + H4(2 + 0)] = sum10;
3108             d[s + H4(2 + 1)] = sum11;
3109         }
3110     } else {
3111         for (s = 0; s < opr_sz / 4; s += 4) {
3112             float32 sum00, sum01, sum10, sum11;
3113 
3114             /*
3115              * Process the entire segment at once, writing back the
3116              * results only after we've consumed all of the inputs.
3117              *
3118              * Key to indices by column:
3119              *               i   j           i   k             j   k
3120              */
3121             sum00 = a[s + H4(0 + 0)];
3122             sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
3123             sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
3124 
3125             sum01 = a[s + H4(0 + 1)];
3126             sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
3127             sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
3128 
3129             sum10 = a[s + H4(2 + 0)];
3130             sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
3131             sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
3132 
3133             sum11 = a[s + H4(2 + 1)];
3134             sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
3135             sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
3136 
3137             d[s + H4(0 + 0)] = sum00;
3138             d[s + H4(0 + 1)] = sum01;
3139             d[s + H4(2 + 0)] = sum10;
3140             d[s + H4(2 + 1)] = sum11;
3141         }
3142     }
3143     clear_tail(d, opr_sz, simd_maxsz(desc));
3144 }
3145 
3146 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
3147                          float_status *stat, uint32_t desc)
3148 {
3149     intptr_t i, opr_sz = simd_oprsz(desc);
3150     intptr_t sel = simd_data(desc);
3151     float32 *d = vd, *a = va;
3152     bfloat16 *n = vn, *m = vm;
3153 
3154     for (i = 0; i < opr_sz / 4; ++i) {
3155         float32 nn = n[H2(i * 2 + sel)] << 16;
3156         float32 mm = m[H2(i * 2 + sel)] << 16;
3157         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
3158     }
3159     clear_tail(d, opr_sz, simd_maxsz(desc));
3160 }
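
/*
 * sel selects the bottom (0) or top (1) bfloat16 of each 32-bit
 * pair: BFMLALB operates on the even-numbered elements, BFMLALT on
 * the odd, each widened to float32 by the << 16 shift and then fed
 * to an ordinary fused multiply-add with the float32 accumulator.
 */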
3161 
3162 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
3163                              void *va, float_status *stat, uint32_t desc)
3164 {
3165     intptr_t i, j, opr_sz = simd_oprsz(desc);
3166     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3167     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3168     intptr_t elements = opr_sz / 4;
3169     intptr_t eltspersegment = MIN(16 / 4, elements);
3170     float32 *d = vd, *a = va;
3171     bfloat16 *n = vn, *m = vm;
3172 
3173     for (i = 0; i < elements; i += eltspersegment) {
3174         float32 m_idx = m[H2(2 * i + index)] << 16;
3175 
3176         for (j = i; j < i + eltspersegment; j++) {
3177             float32 n_j = n[H2(2 * j + sel)] << 16;
3178             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
3179         }
3180     }
3181     clear_tail(d, opr_sz, simd_maxsz(desc));
3182 }
3183 
3184 #define DO_CLAMP(NAME, TYPE) \
3185 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
3186 {                                                                       \
3187     intptr_t i, opr_sz = simd_oprsz(desc);                              \
3188     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
3189         TYPE aa = *(TYPE *)(a + i);                                     \
3190         TYPE nn = *(TYPE *)(n + i);                                     \
3191         TYPE mm = *(TYPE *)(m + i);                                     \
3192         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
3193         *(TYPE *)(d + i) = dd;                                          \
3194     }                                                                   \
3195     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
3196 }
3197 
3198 DO_CLAMP(gvec_sclamp_b, int8_t)
3199 DO_CLAMP(gvec_sclamp_h, int16_t)
3200 DO_CLAMP(gvec_sclamp_s, int32_t)
3201 DO_CLAMP(gvec_sclamp_d, int64_t)
3202 
3203 DO_CLAMP(gvec_uclamp_b, uint8_t)
3204 DO_CLAMP(gvec_uclamp_h, uint16_t)
3205 DO_CLAMP(gvec_uclamp_s, uint32_t)
3206 DO_CLAMP(gvec_uclamp_d, uint64_t)
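
/*
 * These clamp each element of 'a' into the range [n, m]: e.g. for
 * gvec_sclamp_b with aa = 100, nn = -5, mm = 10, the result is
 * MIN(MAX(100, -5), 10) = MIN(100, 10) = 10.  If nn > mm, the MIN
 * is applied last, so mm wins.
 */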
3207 
3208 /* Bit count in each 8-bit word. */
3209 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
3210 {
3211     intptr_t i, opr_sz = simd_oprsz(desc);
3212     uint8_t *d = vd, *n = vn;
3213 
3214     for (i = 0; i < opr_sz; ++i) {
3215         d[i] = ctpop8(n[i]);
3216     }
3217     clear_tail(d, opr_sz, simd_maxsz(desc));
3218 }
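
/* For example, ctpop8(0xb2) = 4, since 0xb2 = 0b10110010. */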
3219 
3220 /* Reverse bits in each 8-bit word. */
3221 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
3222 {
3223     intptr_t i, opr_sz = simd_oprsz(desc);
3224     uint64_t *d = vd, *n = vn;
3225 
3226     for (i = 0; i < opr_sz / 8; ++i) {
3227         d[i] = revbit64(bswap64(n[i]));
3228     }
3229     clear_tail(d, opr_sz, simd_maxsz(desc));
3230 }
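
/*
 * The per-byte bit reversal is done 64 bits at a time: revbit64()
 * reverses all 64 bits, which reverses both the bits within each
 * byte and the byte order, so the prior bswap64() cancels the
 * latter.  E.g. n = 0x01 (bit 0 of byte 0): bswap64 gives
 * 0x0100000000000000, revbit64 gives 0x0000000000000080, i.e.
 * bit 7 of byte 0, as required.
 */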
3231 
3232 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
3233 {
3234     intptr_t i, opr_sz = simd_oprsz(desc);
3235     uint32_t *d = vd, *n = vn;
3236 
3237     for (i = 0; i < opr_sz / 4; ++i) {
3238         d[i] = helper_recpe_u32(n[i]);
3239     }
3240     clear_tail(d, opr_sz, simd_maxsz(desc));
3241 }
3242 
3243 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
3244 {
3245     intptr_t i, opr_sz = simd_oprsz(desc);
3246     uint32_t *d = vd, *n = vn;
3247 
3248     for (i = 0; i < opr_sz / 4; ++i) {
3249         d[i] = helper_rsqrte_u32(n[i]);
3250     }
3251     clear_tail(d, opr_sz, simd_maxsz(desc));
3252 }
3253