xref: /qemu/target/arm/tcg/vec_helper.c (revision 1c349f43b18608d57a72a3d6d5e95b28a1c14470)
1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28 
29 /*
30  * Data for expanding active predicate bits to bytes, for byte elements.
31  *
32  *  for (i = 0; i < 256; ++i) {
33  *      unsigned long m = 0;
34  *      for (j = 0; j < 8; j++) {
35  *          if ((i >> j) & 1) {
36  *              m |= 0xfful << (j << 3);
37  *          }
38  *      }
39  *      printf("0x%016lx,\n", m);
40  *  }
41  */
42 const uint64_t expand_pred_b_data[256] = {
43     0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44     0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45     0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46     0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47     0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48     0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49     0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50     0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51     0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52     0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53     0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54     0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55     0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56     0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57     0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58     0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59     0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60     0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61     0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62     0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63     0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64     0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65     0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66     0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67     0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68     0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69     0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70     0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71     0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72     0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73     0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74     0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75     0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76     0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77     0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78     0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79     0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80     0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81     0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82     0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83     0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84     0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85     0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86     0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87     0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88     0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89     0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90     0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91     0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92     0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93     0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94     0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95     0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96     0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97     0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98     0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99     0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100     0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101     0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102     0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103     0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104     0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105     0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106     0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107     0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108     0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109     0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110     0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111     0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112     0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113     0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114     0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115     0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116     0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117     0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118     0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119     0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120     0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121     0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122     0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123     0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124     0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125     0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126     0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127     0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128     0xffffffffffffffff,
129 };
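
/*
 * Worked example of the table above: with predicate bits 0, 1 and 3 set
 * in a byte, the corresponding entry is
 *
 *     expand_pred_b_data[0x0b] == 0x00000000ff00ffffull
 *
 * i.e. bytes 0, 1 and 3 of the 64-bit result are all-ones, matching the
 * active byte elements.
 */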
130 
131 /*
132  * Similarly for half-word elements.
133  *  for (i = 0; i < 256; ++i) {
134  *      unsigned long m = 0;
135  *      if (i & 0xaa) {
136  *          continue;
137  *      }
138  *      for (j = 0; j < 8; j += 2) {
139  *          if ((i >> j) & 1) {
140  *              m |= 0xfffful << (j << 3);
141  *          }
142  *      }
143  *      printf("[0x%x] = 0x%016lx,\n", i, m);
144  *  }
145  */
146 const uint64_t expand_pred_h_data[0x55 + 1] = {
147     [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148     [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149     [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150     [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151     [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152     [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153     [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154     [0x55] = 0xffffffffffffffff,
155 };
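
/*
 * For half-word elements only every other predicate bit is significant,
 * which is why the generator above skips any index with an odd-position
 * bit set; only the 16 indices built from bits 0, 2, 4 and 6 are
 * expected to be looked up.  Worked example: bits 0 and 4 active gives
 *
 *     expand_pred_h_data[0x11] == 0x0000ffff0000ffff
 *
 * i.e. half-words 0 and 2 of the result are all-ones.
 */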
156 
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159                      bool neg, bool round)
160 {
161     /*
162      * Simplify the architectural expression:
163      * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164      * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
165      */
166     int32_t ret = (int32_t)src1 * src2;
167     if (neg) {
168         ret = -ret;
169     }
170     ret += ((int32_t)src3 << 7) + (round << 6);
171     ret >>= 7;
172 
173     if (ret != (int8_t)ret) {
174         ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175     }
176     return ret;
177 }
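
/*
 * Worked examples for the helper above (the values follow directly from
 * the arithmetic in do_sqrdmlah_b):
 *
 *     do_sqrdmlah_b(0x40, 0x40, 0, false, true)
 *         = (64 * 64 + (1 << 6)) >> 7 = 4160 >> 7 = 32
 *
 *     do_sqrdmlah_b(INT8_MIN, INT8_MIN, 0, false, true)
 *         = (16384 + 64) >> 7 = 128, which does not fit in int8_t,
 *         so the result saturates to INT8_MAX (127).
 */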
178 
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180                              void *va, uint32_t desc)
181 {
182     intptr_t i, opr_sz = simd_oprsz(desc);
183     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 
185     for (i = 0; i < opr_sz; ++i) {
186         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187     }
188 }
189 
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191                              void *va, uint32_t desc)
192 {
193     intptr_t i, opr_sz = simd_oprsz(desc);
194     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 
196     for (i = 0; i < opr_sz; ++i) {
197         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198     }
199 }
200 
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203     intptr_t i, opr_sz = simd_oprsz(desc);
204     int8_t *d = vd, *n = vn, *m = vm;
205 
206     for (i = 0; i < opr_sz; ++i) {
207         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208     }
209 }
210 
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213     intptr_t i, opr_sz = simd_oprsz(desc);
214     int8_t *d = vd, *n = vn, *m = vm;
215 
216     for (i = 0; i < opr_sz; ++i) {
217         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218     }
219 }
220 
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223                       bool neg, bool round, uint32_t *sat)
224 {
225     /* Simplify similarly to do_sqrdmlah_b above.  */
226     int32_t ret = (int32_t)src1 * src2;
227     if (neg) {
228         ret = -ret;
229     }
230     ret += ((int32_t)src3 << 15) + (round << 14);
231     ret >>= 15;
232 
233     if (ret != (int16_t)ret) {
234         *sat = 1;
235         ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236     }
237     return ret;
238 }
239 
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241                                   uint32_t src2, uint32_t src3)
242 {
243     uint32_t *sat = &env->vfp.qc[0];
244     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246                                 false, true, sat);
247     return deposit32(e1, 16, 16, e2);
248 }
249 
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251                               void *vq, uint32_t desc)
252 {
253     uintptr_t opr_sz = simd_oprsz(desc);
254     int16_t *d = vd;
255     int16_t *n = vn;
256     int16_t *m = vm;
257     uintptr_t i;
258 
259     for (i = 0; i < opr_sz / 2; ++i) {
260         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261     }
262     clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264 
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266                                   uint32_t src2, uint32_t src3)
267 {
268     uint32_t *sat = &env->vfp.qc[0];
269     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271                                 true, true, sat);
272     return deposit32(e1, 16, 16, e2);
273 }
274 
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276                               void *vq, uint32_t desc)
277 {
278     uintptr_t opr_sz = simd_oprsz(desc);
279     int16_t *d = vd;
280     int16_t *n = vn;
281     int16_t *m = vm;
282     uintptr_t i;
283 
284     for (i = 0; i < opr_sz / 2; ++i) {
285         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286     }
287     clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289 
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291                             void *vq, uint32_t desc)
292 {
293     intptr_t i, opr_sz = simd_oprsz(desc);
294     int16_t *d = vd, *n = vn, *m = vm;
295 
296     for (i = 0; i < opr_sz / 2; ++i) {
297         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298     }
299     clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301 
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303                              void *vq, uint32_t desc)
304 {
305     intptr_t i, opr_sz = simd_oprsz(desc);
306     int16_t *d = vd, *n = vn, *m = vm;
307 
308     for (i = 0; i < opr_sz / 2; ++i) {
309         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310     }
311     clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313 
314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315                                 void *vq, uint32_t desc)
316 {
317     intptr_t i, j, opr_sz = simd_oprsz(desc);
318     int idx = simd_data(desc);
319     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320     intptr_t elements = opr_sz / 2;
321     intptr_t eltspersegment = MIN(16 / 2, elements);
322 
323     for (i = 0; i < elements; i += 16 / 2) {
324         int16_t mm = m[i];
325         for (j = 0; j < eltspersegment; ++j) {
326             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
327         }
328     }
329     clear_tail(d, opr_sz, simd_maxsz(desc));
330 }
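
/*
 * In the indexed helpers here, the vector is processed in 128-bit
 * segments: every multiply within a segment uses the single element at
 * position 'idx' of the matching segment of vm.  For example, with a
 * 32-byte operation and idx == 3, result elements 0..7 all use vm
 * element 3 and result elements 8..15 all use vm element 11.
 */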
331 
332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
333                                  void *vq, uint32_t desc)
334 {
335     intptr_t i, j, opr_sz = simd_oprsz(desc);
336     int idx = simd_data(desc);
337     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338     intptr_t elements = opr_sz / 2;
339     intptr_t eltspersegment = MIN(16 / 2, elements);
340 
341     for (i = 0; i < elements; i += 16 / 2) {
342         int16_t mm = m[i];
343         for (j = 0; j < eltspersegment; ++j) {
344             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
345         }
346     }
347     clear_tail(d, opr_sz, simd_maxsz(desc));
348 }
349 
350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
351                                  void *vq, uint32_t desc)
352 {
353     intptr_t i, j, opr_sz = simd_oprsz(desc);
354     int idx = simd_data(desc);
355     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
356     intptr_t elements = opr_sz / 2;
357     intptr_t eltspersegment = MIN(16 / 2, elements);
358 
359     for (i = 0; i < elements; i += 16 / 2) {
360         int16_t mm = m[i];
361         for (j = 0; j < eltspersegment; ++j) {
362             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
363         }
364     }
365     clear_tail(d, opr_sz, simd_maxsz(desc));
366 }
367 
368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
369                                  void *vq, uint32_t desc)
370 {
371     intptr_t i, j, opr_sz = simd_oprsz(desc);
372     int idx = simd_data(desc);
373     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
374     intptr_t elements = opr_sz / 2;
375     intptr_t eltspersegment = MIN(16 / 2, elements);
376 
377     for (i = 0; i < elements; i += 16 / 2) {
378         int16_t mm = m[i];
379         for (j = 0; j < eltspersegment; ++j) {
380             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
381         }
382     }
383     clear_tail(d, opr_sz, simd_maxsz(desc));
384 }
385 
386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
387                              void *va, uint32_t desc)
388 {
389     intptr_t i, opr_sz = simd_oprsz(desc);
390     int16_t *d = vd, *n = vn, *m = vm, *a = va;
391     uint32_t discard;
392 
393     for (i = 0; i < opr_sz / 2; ++i) {
394         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
395     }
396 }
397 
398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
399                              void *va, uint32_t desc)
400 {
401     intptr_t i, opr_sz = simd_oprsz(desc);
402     int16_t *d = vd, *n = vn, *m = vm, *a = va;
403     uint32_t discard;
404 
405     for (i = 0; i < opr_sz / 2; ++i) {
406         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
407     }
408 }
409 
410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
411 {
412     intptr_t i, opr_sz = simd_oprsz(desc);
413     int16_t *d = vd, *n = vn, *m = vm;
414     uint32_t discard;
415 
416     for (i = 0; i < opr_sz / 2; ++i) {
417         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
418     }
419 }
420 
421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
422 {
423     intptr_t i, opr_sz = simd_oprsz(desc);
424     int16_t *d = vd, *n = vn, *m = vm;
425     uint32_t discard;
426 
427     for (i = 0; i < opr_sz / 2; ++i) {
428         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
429     }
430 }
431 
432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
433 {
434     intptr_t i, j, opr_sz = simd_oprsz(desc);
435     int idx = simd_data(desc);
436     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
437     uint32_t discard;
438 
439     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
440         int16_t mm = m[i];
441         for (j = 0; j < 16 / 2; ++j) {
442             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
443         }
444     }
445 }
446 
447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
448 {
449     intptr_t i, j, opr_sz = simd_oprsz(desc);
450     int idx = simd_data(desc);
451     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
452     uint32_t discard;
453 
454     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
455         int16_t mm = m[i];
456         for (j = 0; j < 16 / 2; ++j) {
457             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
458         }
459     }
460 }
461 
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
464                       bool neg, bool round, uint32_t *sat)
465 {
466     /* Simplify similarly to do_sqrdmlah_b above.  */
467     int64_t ret = (int64_t)src1 * src2;
468     if (neg) {
469         ret = -ret;
470     }
471     ret += ((int64_t)src3 << 31) + (round << 30);
472     ret >>= 31;
473 
474     if (ret != (int32_t)ret) {
475         *sat = 1;
476         ret = (ret < 0 ? INT32_MIN : INT32_MAX);
477     }
478     return ret;
479 }
480 
481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
482                                   int32_t src2, int32_t src3)
483 {
484     uint32_t *sat = &env->vfp.qc[0];
485     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
486 }
487 
488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
489                               void *vq, uint32_t desc)
490 {
491     uintptr_t opr_sz = simd_oprsz(desc);
492     int32_t *d = vd;
493     int32_t *n = vn;
494     int32_t *m = vm;
495     uintptr_t i;
496 
497     for (i = 0; i < opr_sz / 4; ++i) {
498         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
499     }
500     clear_tail(d, opr_sz, simd_maxsz(desc));
501 }
502 
503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
504                                   int32_t src2, int32_t src3)
505 {
506     uint32_t *sat = &env->vfp.qc[0];
507     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
508 }
509 
510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
511                               void *vq, uint32_t desc)
512 {
513     uintptr_t opr_sz = simd_oprsz(desc);
514     int32_t *d = vd;
515     int32_t *n = vn;
516     int32_t *m = vm;
517     uintptr_t i;
518 
519     for (i = 0; i < opr_sz / 4; ++i) {
520         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
521     }
522     clear_tail(d, opr_sz, simd_maxsz(desc));
523 }
524 
525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
526                             void *vq, uint32_t desc)
527 {
528     intptr_t i, opr_sz = simd_oprsz(desc);
529     int32_t *d = vd, *n = vn, *m = vm;
530 
531     for (i = 0; i < opr_sz / 4; ++i) {
532         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
533     }
534     clear_tail(d, opr_sz, simd_maxsz(desc));
535 }
536 
537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
538                              void *vq, uint32_t desc)
539 {
540     intptr_t i, opr_sz = simd_oprsz(desc);
541     int32_t *d = vd, *n = vn, *m = vm;
542 
543     for (i = 0; i < opr_sz / 4; ++i) {
544         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
545     }
546     clear_tail(d, opr_sz, simd_maxsz(desc));
547 }
548 
549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
550                                 void *vq, uint32_t desc)
551 {
552     intptr_t i, j, opr_sz = simd_oprsz(desc);
553     int idx = simd_data(desc);
554     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
555     intptr_t elements = opr_sz / 4;
556     intptr_t eltspersegment = MIN(16 / 4, elements);
557 
558     for (i = 0; i < elements; i += 16 / 4) {
559         int32_t mm = m[i];
560         for (j = 0; j < eltspersegment; ++j) {
561             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
562         }
563     }
564     clear_tail(d, opr_sz, simd_maxsz(desc));
565 }
566 
567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
568                                  void *vq, uint32_t desc)
569 {
570     intptr_t i, j, opr_sz = simd_oprsz(desc);
571     int idx = simd_data(desc);
572     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
573     intptr_t elements = opr_sz / 4;
574     intptr_t eltspersegment = MIN(16 / 4, elements);
575 
576     for (i = 0; i < elements; i += 16 / 4) {
577         int32_t mm = m[i];
578         for (j = 0; j < eltspersegment; ++j) {
579             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
580         }
581     }
582     clear_tail(d, opr_sz, simd_maxsz(desc));
583 }
584 
585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
586                                  void *vq, uint32_t desc)
587 {
588     intptr_t i, j, opr_sz = simd_oprsz(desc);
589     int idx = simd_data(desc);
590     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
591     intptr_t elements = opr_sz / 4;
592     intptr_t eltspersegment = MIN(16 / 4, elements);
593 
594     for (i = 0; i < elements; i += 16 / 4) {
595         int32_t mm = m[i];
596         for (j = 0; j < eltspersegment; ++j) {
597             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
598         }
599     }
600     clear_tail(d, opr_sz, simd_maxsz(desc));
601 }
602 
603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
604                                  void *vq, uint32_t desc)
605 {
606     intptr_t i, j, opr_sz = simd_oprsz(desc);
607     int idx = simd_data(desc);
608     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
609     intptr_t elements = opr_sz / 4;
610     intptr_t eltspersegment = MIN(16 / 4, elements);
611 
612     for (i = 0; i < elements; i += 16 / 4) {
613         int32_t mm = m[i];
614         for (j = 0; j < eltspersegment; ++j) {
615             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
616         }
617     }
618     clear_tail(d, opr_sz, simd_maxsz(desc));
619 }
620 
621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
622                              void *va, uint32_t desc)
623 {
624     intptr_t i, opr_sz = simd_oprsz(desc);
625     int32_t *d = vd, *n = vn, *m = vm, *a = va;
626     uint32_t discard;
627 
628     for (i = 0; i < opr_sz / 4; ++i) {
629         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
630     }
631 }
632 
633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
634                              void *va, uint32_t desc)
635 {
636     intptr_t i, opr_sz = simd_oprsz(desc);
637     int32_t *d = vd, *n = vn, *m = vm, *a = va;
638     uint32_t discard;
639 
640     for (i = 0; i < opr_sz / 4; ++i) {
641         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
642     }
643 }
644 
645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
646 {
647     intptr_t i, opr_sz = simd_oprsz(desc);
648     int32_t *d = vd, *n = vn, *m = vm;
649     uint32_t discard;
650 
651     for (i = 0; i < opr_sz / 4; ++i) {
652         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
653     }
654 }
655 
656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
657 {
658     intptr_t i, opr_sz = simd_oprsz(desc);
659     int32_t *d = vd, *n = vn, *m = vm;
660     uint32_t discard;
661 
662     for (i = 0; i < opr_sz / 4; ++i) {
663         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
664     }
665 }
666 
667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
668 {
669     intptr_t i, j, opr_sz = simd_oprsz(desc);
670     int idx = simd_data(desc);
671     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
672     uint32_t discard;
673 
674     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
675         int32_t mm = m[i];
676         for (j = 0; j < 16 / 4; ++j) {
677             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
678         }
679     }
680 }
681 
682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
683 {
684     intptr_t i, j, opr_sz = simd_oprsz(desc);
685     int idx = simd_data(desc);
686     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
687     uint32_t discard;
688 
689     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
690         int32_t mm = m[i];
691         for (j = 0; j < 16 / 4; ++j) {
692             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
693         }
694     }
695 }
696 
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
698 static int64_t do_sat128_d(Int128 r)
699 {
700     int64_t ls = int128_getlo(r);
701     int64_t hs = int128_gethi(r);
702 
703     if (unlikely(hs != (ls >> 63))) {
704         return hs < 0 ? INT64_MIN : INT64_MAX;
705     }
706     return ls;
707 }
708 
709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710 {
711     uint64_t l, h;
712     Int128 r, t;
713 
714     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
715     muls64(&l, &h, m, n);
716     r = int128_make128(l, h);
717     if (neg) {
718         r = int128_neg(r);
719     }
720     if (a) {
721         t = int128_exts64(a);
722         t = int128_lshift(t, 63);
723         r = int128_add(r, t);
724     }
725     if (round) {
726         t = int128_exts64(1ll << 62);
727         r = int128_add(r, t);
728     }
729     r = int128_rshift(r, 63);
730 
731     return do_sat128_d(r);
732 }
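
/*
 * The full 64x64-bit product does not fit in an int64_t, hence the
 * Int128 arithmetic above.  Worked example of the saturating case:
 *
 *     do_sqrdmlah_d(INT64_MIN, INT64_MIN, 0, false, true)
 *
 * forms the product 2^126; after rounding and the shift by 63 the
 * intermediate is 2^63, which do_sat128_d clamps to INT64_MAX.
 */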
733 
734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735                              void *va, uint32_t desc)
736 {
737     intptr_t i, opr_sz = simd_oprsz(desc);
738     int64_t *d = vd, *n = vn, *m = vm, *a = va;
739 
740     for (i = 0; i < opr_sz / 8; ++i) {
741         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742     }
743 }
744 
745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746                              void *va, uint32_t desc)
747 {
748     intptr_t i, opr_sz = simd_oprsz(desc);
749     int64_t *d = vd, *n = vn, *m = vm, *a = va;
750 
751     for (i = 0; i < opr_sz / 8; ++i) {
752         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753     }
754 }
755 
756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757 {
758     intptr_t i, opr_sz = simd_oprsz(desc);
759     int64_t *d = vd, *n = vn, *m = vm;
760 
761     for (i = 0; i < opr_sz / 8; ++i) {
762         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763     }
764 }
765 
766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767 {
768     intptr_t i, opr_sz = simd_oprsz(desc);
769     int64_t *d = vd, *n = vn, *m = vm;
770 
771     for (i = 0; i < opr_sz / 8; ++i) {
772         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773     }
774 }
775 
776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777 {
778     intptr_t i, j, opr_sz = simd_oprsz(desc);
779     int idx = simd_data(desc);
780     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781 
782     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783         int64_t mm = m[i];
784         for (j = 0; j < 16 / 8; ++j) {
785             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786         }
787     }
788 }
789 
790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791 {
792     intptr_t i, j, opr_sz = simd_oprsz(desc);
793     int idx = simd_data(desc);
794     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795 
796     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797         int64_t mm = m[i];
798         for (j = 0; j < 16 / 8; ++j) {
799             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800         }
801     }
802 }
803 
804 /* Integer 8- and 16-bit dot-product.
805  *
806  * Note that for the loops herein, host endianness does not matter
807  * with respect to the ordering of data within the quad-width lanes.
808  * All elements are treated equally, no matter where they are.
809  */
810 
811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
813 {                                                                         \
814     intptr_t i, opr_sz = simd_oprsz(desc);                                \
815     TYPED *d = vd, *a = va;                                               \
816     TYPEN *n = vn;                                                        \
817     TYPEM *m = vm;                                                        \
818     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
819         d[i] = (a[i] +                                                    \
820                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
821                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
822                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
823                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
824     }                                                                     \
825     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
826 }
827 
828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
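
/*
 * Worked example of the expansion: gvec_sdot_b accumulates, into each
 * 32-bit lane, the dot product of four adjacent signed bytes.  With
 * n[0..3] = {1, -2, 3, -4}, m[0..3] = {1, 1, 1, 1} and a[0] = 10, the
 * first result lane is 10 + 1 - 2 + 3 - 4 = 8.
 */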
833 
834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
836 {                                                                         \
837     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
838     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
839     /*                                                                    \
840      * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
841      * first iteration might not be a full 16 byte segment. But           \
842      * for vector lengths beyond that this must be SVE and we know        \
843      * opr_sz is a multiple of 16, so we need not clamp segend            \
844      * to opr_sz_n when we advance it at the end of the loop.             \
845      */                                                                   \
846     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
847     intptr_t index = simd_data(desc);                                     \
848     TYPED *d = vd, *a = va;                                               \
849     TYPEN *n = vn;                                                        \
850     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
851     do {                                                                  \
852         TYPED m0 = m_indexed[i * 4 + 0];                                  \
853         TYPED m1 = m_indexed[i * 4 + 1];                                  \
854         TYPED m2 = m_indexed[i * 4 + 2];                                  \
855         TYPED m3 = m_indexed[i * 4 + 3];                                  \
856         do {                                                              \
857             d[i] = (a[i] +                                                \
858                     n[i * 4 + 0] * m0 +                                   \
859                     n[i * 4 + 1] * m1 +                                   \
860                     n[i * 4 + 2] * m2 +                                   \
861                     n[i * 4 + 3] * m3);                                   \
862         } while (++i < segend);                                           \
863         segend = i + (16 / sizeof(TYPED));                                \
864     } while (i < opr_sz_n);                                               \
865     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
866 }
867 
868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
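
/*
 * Worked example for the indexed form: gvec_sdot_idx_b with index 2
 * takes bytes 8..11 of each 16-byte segment of vm, and every 32-bit
 * lane in that segment is the dot product of its own four bytes of vn
 * with that one broadcast group.  The opr_sz == 8 AdvSIMD case is the
 * only one where a segment is shorter than 16 bytes, which is what the
 * MIN() in the macro above handles.
 */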
874 
875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
876                          float_status *fpst, uint32_t desc)
877 {
878     uintptr_t opr_sz = simd_oprsz(desc);
879     float16 *d = vd;
880     float16 *n = vn;
881     float16 *m = vm;
882     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
883     bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
884     uintptr_t i;
885 
886     for (i = 0; i < opr_sz / 2; i += 2) {
887         float16 e0 = n[H2(i)];
888         float16 e1 = m[H2(i + 1)];
889         float16 e2 = n[H2(i + 1)];
890         float16 e3 = m[H2(i)];
891 
892         if (rot) {
893             e3 = float16_maybe_ah_chs(e3, fpcr_ah);
894         } else {
895             e1 = float16_maybe_ah_chs(e1, fpcr_ah);
896         }
897 
898         d[H2(i)] = float16_add(e0, e1, fpst);
899         d[H2(i + 1)] = float16_add(e2, e3, fpst);
900     }
901     clear_tail(d, opr_sz, simd_maxsz(desc));
902 }
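
/*
 * In this and the two following FCADD helpers, even/odd lane pairs hold
 * the real and imaginary parts of a complex number.  When 'rot' is
 * clear, the imaginary part of m is negated before being added to the
 * real lane, i.e. m is multiplied by +i (a 90 degree rotation); when
 * 'rot' is set, the real part of m is negated instead, multiplying m
 * by -i (a 270 degree rotation).  Which instruction encoding selects
 * which value of 'rot' is decided by the translator.
 */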
903 
904 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
905                          float_status *fpst, uint32_t desc)
906 {
907     uintptr_t opr_sz = simd_oprsz(desc);
908     float32 *d = vd;
909     float32 *n = vn;
910     float32 *m = vm;
911     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
912     bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
913     uintptr_t i;
914 
915     for (i = 0; i < opr_sz / 4; i += 2) {
916         float32 e0 = n[H4(i)];
917         float32 e1 = m[H4(i + 1)];
918         float32 e2 = n[H4(i + 1)];
919         float32 e3 = m[H4(i)];
920 
921         if (rot) {
922             e3 = float32_maybe_ah_chs(e3, fpcr_ah);
923         } else {
924             e1 = float32_maybe_ah_chs(e1, fpcr_ah);
925         }
926 
927         d[H4(i)] = float32_add(e0, e1, fpst);
928         d[H4(i + 1)] = float32_add(e2, e3, fpst);
929     }
930     clear_tail(d, opr_sz, simd_maxsz(desc));
931 }
932 
933 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
934                          float_status *fpst, uint32_t desc)
935 {
936     uintptr_t opr_sz = simd_oprsz(desc);
937     float64 *d = vd;
938     float64 *n = vn;
939     float64 *m = vm;
940     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
941     bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
942     uintptr_t i;
943 
944     for (i = 0; i < opr_sz / 8; i += 2) {
945         float64 e0 = n[i];
946         float64 e1 = m[i + 1];
947         float64 e2 = n[i + 1];
948         float64 e3 = m[i];
949 
950         if (rot) {
951             e3 = float64_maybe_ah_chs(e3, fpcr_ah);
952         } else {
953             e1 = float64_maybe_ah_chs(e1, fpcr_ah);
954         }
955 
956         d[i] = float64_add(e0, e1, fpst);
957         d[i + 1] = float64_add(e2, e3, fpst);
958     }
959     clear_tail(d, opr_sz, simd_maxsz(desc));
960 }
961 
962 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
963                          float_status *fpst, uint32_t desc)
964 {
965     uintptr_t opr_sz = simd_oprsz(desc);
966     float16 *d = vd, *n = vn, *m = vm, *a = va;
967     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
968     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
969     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
970     uint32_t negf_real = flip ^ negf_imag;
971     float16 negx_imag, negx_real;
972     uintptr_t i;
973 
974     /* With AH=0, use negx; with AH=1 use negf. */
975     negx_real = (negf_real & ~fpcr_ah) << 15;
976     negx_imag = (negf_imag & ~fpcr_ah) << 15;
977     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
978     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
979 
980     for (i = 0; i < opr_sz / 2; i += 2) {
981         float16 e2 = n[H2(i + flip)];
982         float16 e1 = m[H2(i + flip)] ^ negx_real;
983         float16 e4 = e2;
984         float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag;
985 
986         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], negf_real, fpst);
987         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], negf_imag, fpst);
988     }
989     clear_tail(d, opr_sz, simd_maxsz(desc));
990 }
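
/*
 * A note on the negx/negf pattern used by all the FCMLA helpers: with
 * FPCR.AH == 0 the required negation is applied by flipping the sign
 * bit of the m operand lane (negx_*) before the fused multiply-add,
 * whereas with FPCR.AH == 1 it is passed to float*_muladd as
 * float_muladd_negate_product so that a NaN operand does not have its
 * sign flipped.  At most one of the two mechanisms is active for a
 * given lane.
 */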
991 
992 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
993                              float_status *fpst, uint32_t desc)
994 {
995     uintptr_t opr_sz = simd_oprsz(desc);
996     float16 *d = vd, *n = vn, *m = vm, *a = va;
997     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
998     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
999     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1000     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
1001     uint32_t negf_real = flip ^ negf_imag;
1002     intptr_t elements = opr_sz / sizeof(float16);
1003     intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
1004     float16 negx_imag, negx_real;
1005     intptr_t i, j;
1006 
1007     /* With AH=0, use negx; with AH=1 use negf. */
1008     negx_real = (negf_real & ~fpcr_ah) << 15;
1009     negx_imag = (negf_imag & ~fpcr_ah) << 15;
1010     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1011     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1012 
1013     for (i = 0; i < elements; i += eltspersegment) {
1014         float16 mr = m[H2(i + 2 * index + 0)];
1015         float16 mi = m[H2(i + 2 * index + 1)];
1016         float16 e1 = negx_real ^ (flip ? mi : mr);
1017         float16 e3 = negx_imag ^ (flip ? mr : mi);
1018 
1019         for (j = i; j < i + eltspersegment; j += 2) {
1020             float16 e2 = n[H2(j + flip)];
1021             float16 e4 = e2;
1022 
1023             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], negf_real, fpst);
1024             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], negf_imag, fpst);
1025         }
1026     }
1027     clear_tail(d, opr_sz, simd_maxsz(desc));
1028 }
1029 
1030 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
1031                          float_status *fpst, uint32_t desc)
1032 {
1033     uintptr_t opr_sz = simd_oprsz(desc);
1034     float32 *d = vd, *n = vn, *m = vm, *a = va;
1035     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1036     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
1037     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1038     uint32_t negf_real = flip ^ negf_imag;
1039     float32 negx_imag, negx_real;
1040     uintptr_t i;
1041 
1042     /* With AH=0, use negx; with AH=1 use negf. */
1043     negx_real = (negf_real & ~fpcr_ah) << 31;
1044     negx_imag = (negf_imag & ~fpcr_ah) << 31;
1045     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1046     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1047 
1048     for (i = 0; i < opr_sz / 4; i += 2) {
1049         float32 e2 = n[H4(i + flip)];
1050         float32 e1 = m[H4(i + flip)] ^ negx_real;
1051         float32 e4 = e2;
1052         float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag;
1053 
1054         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], negf_real, fpst);
1055         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], negf_imag, fpst);
1056     }
1057     clear_tail(d, opr_sz, simd_maxsz(desc));
1058 }
1059 
1060 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
1061                              float_status *fpst, uint32_t desc)
1062 {
1063     uintptr_t opr_sz = simd_oprsz(desc);
1064     float32 *d = vd, *n = vn, *m = vm, *a = va;
1065     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1066     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1067     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1068     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
1069     uint32_t negf_real = flip ^ negf_imag;
1070     intptr_t elements = opr_sz / sizeof(float32);
1071     intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
1072     float32 negx_imag, negx_real;
1073     intptr_t i, j;
1074 
1075     /* With AH=0, use negx; with AH=1 use negf. */
1076     negx_real = (negf_real & ~fpcr_ah) << 31;
1077     negx_imag = (negf_imag & ~fpcr_ah) << 31;
1078     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1079     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1080 
1081     for (i = 0; i < elements; i += eltspersegment) {
1082         float32 mr = m[H4(i + 2 * index + 0)];
1083         float32 mi = m[H4(i + 2 * index + 1)];
1084         float32 e1 = negx_real ^ (flip ? mi : mr);
1085         float32 e3 = negx_imag ^ (flip ? mr : mi);
1086 
1087         for (j = i; j < i + eltspersegment; j += 2) {
1088             float32 e2 = n[H4(j + flip)];
1089             float32 e4 = e2;
1090 
1091             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], negf_real, fpst);
1092             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], negf_imag, fpst);
1093         }
1094     }
1095     clear_tail(d, opr_sz, simd_maxsz(desc));
1096 }
1097 
1098 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
1099                          float_status *fpst, uint32_t desc)
1100 {
1101     uintptr_t opr_sz = simd_oprsz(desc);
1102     float64 *d = vd, *n = vn, *m = vm, *a = va;
1103     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1104     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
1105     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1106     uint32_t negf_real = flip ^ negf_imag;
1107     float64 negx_real, negx_imag;
1108     uintptr_t i;
1109 
1110     /* With AH=0, use negx; with AH=1 use negf. */
1111     negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
1112     negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
1113     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1114     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1115 
1116     for (i = 0; i < opr_sz / 8; i += 2) {
1117         float64 e2 = n[i + flip];
1118         float64 e1 = m[i + flip] ^ negx_real;
1119         float64 e4 = e2;
1120         float64 e3 = m[i + 1 - flip] ^ negx_imag;
1121 
1122         d[i] = float64_muladd(e2, e1, a[i], negf_real, fpst);
1123         d[i + 1] = float64_muladd(e4, e3, a[i + 1], negf_imag, fpst);
1124     }
1125     clear_tail(d, opr_sz, simd_maxsz(desc));
1126 }
1127 
1128 /*
1129  * Floating point comparisons producing an integer result (all 1s or all 0s).
1130  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1131  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1132  */
1133 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1134 {
1135     return -float16_eq_quiet(op1, op2, stat);
1136 }
1137 
1138 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1139 {
1140     return -float32_eq_quiet(op1, op2, stat);
1141 }
1142 
1143 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1144 {
1145     return -float64_eq_quiet(op1, op2, stat);
1146 }
1147 
1148 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1149 {
1150     return -float16_le(op2, op1, stat);
1151 }
1152 
1153 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1154 {
1155     return -float32_le(op2, op1, stat);
1156 }
1157 
1158 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1159 {
1160     return -float64_le(op2, op1, stat);
1161 }
1162 
1163 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1164 {
1165     return -float16_lt(op2, op1, stat);
1166 }
1167 
1168 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1169 {
1170     return -float32_lt(op2, op1, stat);
1171 }
1172 
1173 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1174 {
1175     return -float64_lt(op2, op1, stat);
1176 }
1177 
1178 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1179 {
1180     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1181 }
1182 
1183 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1184 {
1185     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1186 }
1187 
1188 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1189 {
1190     return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1191 }
1192 
1193 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1194 {
1195     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1196 }
1197 
1198 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1199 {
1200     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1201 }
1202 
1203 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1204 {
1205     return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1206 }
1207 
1208 static int16_t vfp_tosszh(float16 x, float_status *fpst)
1209 {
1210     if (float16_is_any_nan(x)) {
1211         float_raise(float_flag_invalid, fpst);
1212         return 0;
1213     }
1214     return float16_to_int16_round_to_zero(x, fpst);
1215 }
1216 
1217 static uint16_t vfp_touszh(float16 x, float_status *fpst)
1218 {
1219     if (float16_is_any_nan(x)) {
1220         float_raise(float_flag_invalid, fpst);
1221         return 0;
1222     }
1223     return float16_to_uint16_round_to_zero(x, fpst);
1224 }
1225 
1226 #define DO_2OP(NAME, FUNC, TYPE) \
1227 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc)  \
1228 {                                                                 \
1229     intptr_t i, oprsz = simd_oprsz(desc);                         \
1230     TYPE *d = vd, *n = vn;                                        \
1231     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1232         d[i] = FUNC(n[i], stat);                                  \
1233     }                                                             \
1234     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1235 }
1236 
1237 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1238 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1239 DO_2OP(gvec_frecpe_rpres_s, helper_recpe_rpres_f32, float32)
1240 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1241 
1242 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1243 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1244 DO_2OP(gvec_frsqrte_rpres_s, helper_rsqrte_rpres_f32, float32)
1245 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1246 
1247 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1248 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1249 
1250 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1251 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1252 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1253 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1254 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1255 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1256 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1257 DO_2OP(gvec_touszh, vfp_touszh, float16)
1258 
1259 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1260     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1261     {                                                           \
1262         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1263     }
1264 
1265 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1266     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)    \
1267     {                                                           \
1268         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1269     }
1270 
1271 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1272     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1273     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1274     WRAP_CMP0_##DIRN(FN, CMPOP, float64)                \
1275     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1276     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)   \
1277     DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)
1278 
1279 DO_2OP_CMP0(cgt, cgt, FWD)
1280 DO_2OP_CMP0(cge, cge, FWD)
1281 DO_2OP_CMP0(ceq, ceq, FWD)
1282 DO_2OP_CMP0(clt, cgt, REV)
1283 DO_2OP_CMP0(cle, cge, REV)
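
/*
 * The FWD/REV wrappers above differ only in operand order, so e.g.
 * float32_clt0(op) expands to float32_cgt(float32_zero, op), i.e.
 * "0 > op", which is the required "op < 0" comparison.
 */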
1284 
1285 #undef DO_2OP
1286 #undef DO_2OP_CMP0
1287 
1288 /* Floating-point trigonometric starting value.
1289  * See the ARM ARM pseudocode function FPTrigSMul.
1290  */
1291 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1292 {
1293     float16 result = float16_mul(op1, op1, stat);
1294     if (!float16_is_any_nan(result)) {
1295         result = float16_set_sign(result, op2 & 1);
1296     }
1297     return result;
1298 }
1299 
1300 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1301 {
1302     float32 result = float32_mul(op1, op1, stat);
1303     if (!float32_is_any_nan(result)) {
1304         result = float32_set_sign(result, op2 & 1);
1305     }
1306     return result;
1307 }
1308 
1309 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1310 {
1311     float64 result = float64_mul(op1, op1, stat);
1312     if (!float64_is_any_nan(result)) {
1313         result = float64_set_sign(result, op2 & 1);
1314     }
1315     return result;
1316 }
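
/*
 * Worked example: the result is op1 squared with its sign taken from
 * bit 0 of the integer operand, so ftsmul(2.0, 1) yields -4.0 and
 * ftsmul(2.0, 0) yields +4.0; a NaN product is returned with its sign
 * unchanged.
 */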
1317 
1318 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1319 {
1320     return float16_abs(float16_sub(op1, op2, stat));
1321 }
1322 
1323 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1324 {
1325     return float32_abs(float32_sub(op1, op2, stat));
1326 }
1327 
1328 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1329 {
1330     return float64_abs(float64_sub(op1, op2, stat));
1331 }
1332 
1333 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
1334 static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
1335 {
1336     float16 r = float16_sub(op1, op2, stat);
1337     return float16_is_any_nan(r) ? r : float16_abs(r);
1338 }
1339 
1340 static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
1341 {
1342     float32 r = float32_sub(op1, op2, stat);
1343     return float32_is_any_nan(r) ? r : float32_abs(r);
1344 }
1345 
1346 static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
1347 {
1348     float64 r = float64_sub(op1, op2, stat);
1349     return float64_is_any_nan(r) ? r : float64_abs(r);
1350 }
1351 
1352 /*
1353  * Reciprocal step. These are the AArch32 versions, which use a
1354  * non-fused multiply-and-subtract.
1355  */
1356 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1357 {
1358     op1 = float16_squash_input_denormal(op1, stat);
1359     op2 = float16_squash_input_denormal(op2, stat);
1360 
1361     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1362         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1363         return float16_two;
1364     }
1365     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1366 }
1367 
1368 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1369 {
1370     op1 = float32_squash_input_denormal(op1, stat);
1371     op2 = float32_squash_input_denormal(op2, stat);
1372 
1373     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1374         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1375         return float32_two;
1376     }
1377     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1378 }
1379 
1380 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1381 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1382 {
1383     op1 = float16_squash_input_denormal(op1, stat);
1384     op2 = float16_squash_input_denormal(op2, stat);
1385 
1386     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1387         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1388         return float16_one_point_five;
1389     }
1390     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1391     return float16_div(op1, float16_two, stat);
1392 }
1393 
1394 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1395 {
1396     op1 = float32_squash_input_denormal(op1, stat);
1397     op2 = float32_squash_input_denormal(op2, stat);
1398 
1399     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1400         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1401         return float32_one_point_five;
1402     }
1403     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1404     return float32_div(op1, float32_two, stat);
1405 }
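
/*
 * These non-fused step helpers compute 2 - op1*op2 (recps) and
 * (3 - op1*op2) / 2 (rsqrts), the Newton-Raphson refinement terms used
 * with the reciprocal and reciprocal-square-root estimate instructions;
 * the infinity-times-zero special cases return exactly 2.0 and 1.5 as
 * the architecture defines, instead of the NaN the multiply would give.
 */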
1406 
1407 #define DO_3OP(NAME, FUNC, TYPE) \
1408 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1409                   float_status *stat, uint32_t desc)                       \
1410 {                                                                          \
1411     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1412     TYPE *d = vd, *n = vn, *m = vm;                                        \
1413     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1414         d[i] = FUNC(n[i], m[i], stat);                                     \
1415     }                                                                      \
1416     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1417 }
1418 
1419 DO_3OP(gvec_fadd_h, float16_add, float16)
1420 DO_3OP(gvec_fadd_s, float32_add, float32)
1421 DO_3OP(gvec_fadd_d, float64_add, float64)
1422 
1423 DO_3OP(gvec_fsub_h, float16_sub, float16)
1424 DO_3OP(gvec_fsub_s, float32_sub, float32)
1425 DO_3OP(gvec_fsub_d, float64_sub, float64)
1426 
1427 DO_3OP(gvec_fmul_h, float16_mul, float16)
1428 DO_3OP(gvec_fmul_s, float32_mul, float32)
1429 DO_3OP(gvec_fmul_d, float64_mul, float64)
1430 
1431 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1432 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1433 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1434 
1435 DO_3OP(gvec_fabd_h, float16_abd, float16)
1436 DO_3OP(gvec_fabd_s, float32_abd, float32)
1437 DO_3OP(gvec_fabd_d, float64_abd, float64)
1438 
1439 DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
1440 DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
1441 DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)
1442 
1443 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1444 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1445 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1446 
1447 DO_3OP(gvec_fcge_h, float16_cge, float16)
1448 DO_3OP(gvec_fcge_s, float32_cge, float32)
1449 DO_3OP(gvec_fcge_d, float64_cge, float64)
1450 
1451 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1452 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1453 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1454 
1455 DO_3OP(gvec_facge_h, float16_acge, float16)
1456 DO_3OP(gvec_facge_s, float32_acge, float32)
1457 DO_3OP(gvec_facge_d, float64_acge, float64)
1458 
1459 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1460 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1461 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1462 
1463 DO_3OP(gvec_fmax_h, float16_max, float16)
1464 DO_3OP(gvec_fmax_s, float32_max, float32)
1465 DO_3OP(gvec_fmax_d, float64_max, float64)
1466 
1467 DO_3OP(gvec_fmin_h, float16_min, float16)
1468 DO_3OP(gvec_fmin_s, float32_min, float32)
1469 DO_3OP(gvec_fmin_d, float64_min, float64)
1470 
1471 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1472 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1473 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1474 
1475 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1476 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1477 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1478 
1479 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1480 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1481 
1482 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1483 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1484 
1485 #ifdef TARGET_AARCH64
1486 DO_3OP(gvec_fdiv_h, float16_div, float16)
1487 DO_3OP(gvec_fdiv_s, float32_div, float32)
1488 DO_3OP(gvec_fdiv_d, float64_div, float64)
1489 
1490 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1491 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1492 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1493 
1494 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1495 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1496 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1497 
1498 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1499 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1500 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1501 
1502 DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
1503 DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
1504 DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)
1505 
1506 DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
1507 DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
1508 DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)
1509 
1510 DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
1511 DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
1512 DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)
1513 
1514 DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
1515 DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
1516 DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)
1517 
1518 #endif
1519 #undef DO_3OP
1520 
1521 /* Non-fused multiply-add (unlike float16_muladd etc., which are fused) */
1522 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1523                                  float_status *stat)
1524 {
1525     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1526 }
1527 
1528 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1529                                  float_status *stat)
1530 {
1531     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1532 }
1533 
1534 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1535                                  float_status *stat)
1536 {
1537     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1538 }
1539 
1540 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1541                                  float_status *stat)
1542 {
1543     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1544 }
1545 
1546 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1547 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1548                                 float_status *stat)
1549 {
1550     return float16_muladd(op1, op2, dest, 0, stat);
1551 }
1552 
1553 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1554                                  float_status *stat)
1555 {
1556     return float32_muladd(op1, op2, dest, 0, stat);
1557 }
1558 
1559 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1560                                  float_status *stat)
1561 {
1562     return float64_muladd(op1, op2, dest, 0, stat);
1563 }
1564 
1565 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1566                                  float_status *stat)
1567 {
1568     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1569 }
1570 
1571 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1572                                  float_status *stat)
1573 {
1574     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1575 }
1576 
1577 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1578                                  float_status *stat)
1579 {
1580     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1581 }
1582 
1583 static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2,
1584                                  float_status *stat)
1585 {
1586     return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1587 }
1588 
1589 static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2,
1590                                  float_status *stat)
1591 {
1592     return float32_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1593 }
1594 
1595 static float64 float64_ah_mulsub_f(float64 dest, float64 op1, float64 op2,
1596                                  float_status *stat)
1597 {
1598     return float64_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1599 }
1600 
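     /*
      * As DO_3OP, but the destination also provides the accumulator:
      * d[i] = FUNC(d[i], n[i], m[i]).
      */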
1601 #define DO_MULADD(NAME, FUNC, TYPE)                                        \
1602 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1603                   float_status *stat, uint32_t desc)                       \
1604 {                                                                          \
1605     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1606     TYPE *d = vd, *n = vn, *m = vm;                                        \
1607     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1608         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1609     }                                                                      \
1610     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1611 }
1612 
1613 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1614 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1615 
1616 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1617 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1618 
1619 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1620 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1621 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1622 
1623 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1624 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1625 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1626 
1627 DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16)
1628 DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32)
1629 DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64)
1630 
1631 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1632  * For AdvSIMD, there is of course only one such vector segment.
1633  */
1634 
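     /*
      * Example, for 32-bit elements: segment == 4 (elements per 128-bit
      * segment), so results 0..3 use m[idx], results 4..7 use m[4 + idx],
      * and so on across an SVE vector.
      */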
1635 #define DO_MUL_IDX(NAME, TYPE, H) \
1636 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1637 {                                                                          \
1638     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1639     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1640     intptr_t idx = simd_data(desc);                                        \
1641     TYPE *d = vd, *n = vn, *m = vm;                                        \
1642     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1643         TYPE mm = m[H(i + idx)];                                           \
1644         for (j = 0; j < segment; j++) {                                    \
1645             d[i + j] = n[i + j] * mm;                                      \
1646         }                                                                  \
1647     }                                                                      \
1648     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1649 }
1650 
1651 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1652 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1653 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1654 
1655 #undef DO_MUL_IDX
1656 
1657 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1658 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1659 {                                                                          \
1660     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1661     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1662     intptr_t idx = simd_data(desc);                                        \
1663     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1664     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1665         TYPE mm = m[H(i + idx)];                                           \
1666         for (j = 0; j < segment; j++) {                                    \
1667             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1668         }                                                                  \
1669     }                                                                      \
1670     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1671 }
1672 
1673 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1674 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1675 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1676 
1677 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1678 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1679 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1680 
1681 #undef DO_MLA_IDX
1682 
1683 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1684 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1685                   float_status *stat, uint32_t desc)                       \
1686 {                                                                          \
1687     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1688     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1689     intptr_t idx = simd_data(desc);                                        \
1690     TYPE *d = vd, *n = vn, *m = vm;                                        \
1691     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1692         TYPE mm = m[H(i + idx)];                                           \
1693         for (j = 0; j < segment; j++) {                                    \
1694             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1695         }                                                                  \
1696     }                                                                      \
1697     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1698 }
1699 
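     /* For plain FMUL (no accumulation), ADD degenerates to returning the product. */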
1700 #define nop(N, M, S) (M)
1701 
1702 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1703 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1704 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1705 
1706 #ifdef TARGET_AARCH64
1707 
1708 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1709 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1710 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1711 
1712 #endif
1713 
1714 #undef nop
1715 
1716 /*
1717  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1718  * the fused ops below they accumulate both from and into Vd.
1719  */
1720 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1721 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1722 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1723 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1724 
1725 #undef DO_FMUL_IDX
1726 
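     /*
      * NEGX is XORed into each first-operand element before the multiply
      * (a sign-bit flip, used by the FMLS forms when FPCR.AH == 0); NEGF
      * is passed as muladd flags (float_muladd_negate_product, used by
      * the FMLS forms when FPCR.AH == 1).
      */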
1727 #define DO_FMLA_IDX(NAME, TYPE, H, NEGX, NEGF)                             \
1728 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1729                   float_status *stat, uint32_t desc)                       \
1730 {                                                                          \
1731     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1732     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1733     intptr_t idx = simd_data(desc);                                        \
1734     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1735     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1736         TYPE mm = m[H(i + idx)];                                           \
1737         for (j = 0; j < segment; j++) {                                    \
1738             d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm,                  \
1739                                      a[i + j], NEGF, stat);                \
1740         }                                                                  \
1741     }                                                                      \
1742     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1743 }
1744 
1745 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0)
1746 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0)
1747 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0)
1748 
1749 DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0)
1750 DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0)
1751 DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0)
1752 
1753 DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product)
1754 DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product)
1755 DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product)
1756 
1757 #undef DO_FMLA_IDX
1758 
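     /*
      * Saturating add/sub: compute in the wider type WTYPE, clamp the
      * result to [MIN, MAX], and set the sticky QC flag (low word of vq)
      * if any element saturated.
      */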
1759 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1760 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1761 {                                                                          \
1762     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1763     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1764     bool q = false;                                                        \
1765     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1766         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1767         if (dd < MIN) {                                                    \
1768             dd = MIN;                                                      \
1769             q = true;                                                      \
1770         } else if (dd > MAX) {                                             \
1771             dd = MAX;                                                      \
1772             q = true;                                                      \
1773         }                                                                  \
1774         d[i] = dd;                                                         \
1775     }                                                                      \
1776     if (q) {                                                               \
1777         uint32_t *qc = vq;                                                 \
1778         qc[0] = 1;                                                         \
1779     }                                                                      \
1780     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1781 }
1782 
1783 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1784 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1785 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1786 
1787 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1788 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1789 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1790 
1791 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1792 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1793 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1794 
1795 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1796 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1797 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1798 
1799 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1800 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1801 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1802 
1803 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1804 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1805 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1806 
1807 #undef DO_SAT
1808 
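     /*
      * The 64-bit forms cannot use a wider intermediate type, so detect
      * saturation directly: carry/borrow for the unsigned forms, sign-bit
      * overflow for the signed forms.
      */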
1809 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1810                           void *vm, uint32_t desc)
1811 {
1812     intptr_t i, oprsz = simd_oprsz(desc);
1813     uint64_t *d = vd, *n = vn, *m = vm;
1814     bool q = false;
1815 
1816     for (i = 0; i < oprsz / 8; i++) {
1817         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1818         if (dd < nn) {
1819             dd = UINT64_MAX;
1820             q = true;
1821         }
1822         d[i] = dd;
1823     }
1824     if (q) {
1825         uint32_t *qc = vq;
1826         qc[0] = 1;
1827     }
1828     clear_tail(d, oprsz, simd_maxsz(desc));
1829 }
1830 
1831 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1832                           void *vm, uint32_t desc)
1833 {
1834     intptr_t i, oprsz = simd_oprsz(desc);
1835     uint64_t *d = vd, *n = vn, *m = vm;
1836     bool q = false;
1837 
1838     for (i = 0; i < oprsz / 8; i++) {
1839         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1840         if (nn < mm) {
1841             dd = 0;
1842             q = true;
1843         }
1844         d[i] = dd;
1845     }
1846     if (q) {
1847         uint32_t *qc = vq;
1848         qc[0] = 1;
1849     }
1850     clear_tail(d, oprsz, simd_maxsz(desc));
1851 }
1852 
1853 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1854                           void *vm, uint32_t desc)
1855 {
1856     intptr_t i, oprsz = simd_oprsz(desc);
1857     int64_t *d = vd, *n = vn, *m = vm;
1858     bool q = false;
1859 
1860     for (i = 0; i < oprsz / 8; i++) {
1861         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1862         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1863             dd = (nn >> 63) ^ ~INT64_MIN;
1864             q = true;
1865         }
1866         d[i] = dd;
1867     }
1868     if (q) {
1869         uint32_t *qc = vq;
1870         qc[0] = 1;
1871     }
1872     clear_tail(d, oprsz, simd_maxsz(desc));
1873 }
1874 
1875 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1876                           void *vm, uint32_t desc)
1877 {
1878     intptr_t i, oprsz = simd_oprsz(desc);
1879     int64_t *d = vd, *n = vn, *m = vm;
1880     bool q = false;
1881 
1882     for (i = 0; i < oprsz / 8; i++) {
1883         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1884         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1885             dd = (nn >> 63) ^ ~INT64_MIN;
1886             q = true;
1887         }
1888         d[i] = dd;
1889     }
1890     if (q) {
1891         uint32_t *qc = vq;
1892         qc[0] = 1;
1893     }
1894     clear_tail(d, oprsz, simd_maxsz(desc));
1895 }
1896 
1897 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1898                            void *vm, uint32_t desc)
1899 {
1900     intptr_t i, oprsz = simd_oprsz(desc);
1901     uint64_t *d = vd, *n = vn, *m = vm;
1902     bool q = false;
1903 
1904     for (i = 0; i < oprsz / 8; i++) {
1905         uint64_t nn = n[i];
1906         int64_t mm = m[i];
1907         uint64_t dd = nn + mm;
1908 
1909         if (mm < 0) {
1910             if (nn < (uint64_t)-mm) {
1911                 dd = 0;
1912                 q = true;
1913             }
1914         } else {
1915             if (dd < nn) {
1916                 dd = UINT64_MAX;
1917                 q = true;
1918             }
1919         }
1920         d[i] = dd;
1921     }
1922     if (q) {
1923         uint32_t *qc = vq;
1924         qc[0] = 1;
1925     }
1926     clear_tail(d, oprsz, simd_maxsz(desc));
1927 }
1928 
1929 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1930                            void *vm, uint32_t desc)
1931 {
1932     intptr_t i, oprsz = simd_oprsz(desc);
1933     uint64_t *d = vd, *n = vn, *m = vm;
1934     bool q = false;
1935 
1936     for (i = 0; i < oprsz / 8; i++) {
1937         int64_t nn = n[i];
1938         uint64_t mm = m[i];
1939         int64_t dd = nn + mm;
1940 
1941         if (mm > (uint64_t)(INT64_MAX - nn)) {
1942             dd = INT64_MAX;
1943             q = true;
1944         }
1945         d[i] = dd;
1946     }
1947     if (q) {
1948         uint32_t *qc = vq;
1949         qc[0] = 1;
1950     }
1951     clear_tail(d, oprsz, simd_maxsz(desc));
1952 }
1953 
1954 #define DO_SRA(NAME, TYPE)                              \
1955 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1956 {                                                       \
1957     intptr_t i, oprsz = simd_oprsz(desc);               \
1958     int shift = simd_data(desc);                        \
1959     TYPE *d = vd, *n = vn;                              \
1960     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1961         d[i] += n[i] >> shift;                          \
1962     }                                                   \
1963     clear_tail(d, oprsz, simd_maxsz(desc));             \
1964 }
1965 
1966 DO_SRA(gvec_ssra_b, int8_t)
1967 DO_SRA(gvec_ssra_h, int16_t)
1968 DO_SRA(gvec_ssra_s, int32_t)
1969 DO_SRA(gvec_ssra_d, int64_t)
1970 
1971 DO_SRA(gvec_usra_b, uint8_t)
1972 DO_SRA(gvec_usra_h, uint16_t)
1973 DO_SRA(gvec_usra_s, uint32_t)
1974 DO_SRA(gvec_usra_d, uint64_t)
1975 
1976 #undef DO_SRA
1977 
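     /*
      * Rounding shift right: shift by (shift - 1) first, so that the last
      * bit shifted out remains available as the rounding increment, then
      * complete the shift and add that bit back in.  Splitting the shift
      * also keeps the count in range when shift equals the element size.
      */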
1978 #define DO_RSHR(NAME, TYPE)                             \
1979 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1980 {                                                       \
1981     intptr_t i, oprsz = simd_oprsz(desc);               \
1982     int shift = simd_data(desc);                        \
1983     TYPE *d = vd, *n = vn;                              \
1984     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1985         TYPE tmp = n[i] >> (shift - 1);                 \
1986         d[i] = (tmp >> 1) + (tmp & 1);                  \
1987     }                                                   \
1988     clear_tail(d, oprsz, simd_maxsz(desc));             \
1989 }
1990 
1991 DO_RSHR(gvec_srshr_b, int8_t)
1992 DO_RSHR(gvec_srshr_h, int16_t)
1993 DO_RSHR(gvec_srshr_s, int32_t)
1994 DO_RSHR(gvec_srshr_d, int64_t)
1995 
1996 DO_RSHR(gvec_urshr_b, uint8_t)
1997 DO_RSHR(gvec_urshr_h, uint16_t)
1998 DO_RSHR(gvec_urshr_s, uint32_t)
1999 DO_RSHR(gvec_urshr_d, uint64_t)
2000 
2001 #undef DO_RSHR
2002 
2003 #define DO_RSRA(NAME, TYPE)                             \
2004 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2005 {                                                       \
2006     intptr_t i, oprsz = simd_oprsz(desc);               \
2007     int shift = simd_data(desc);                        \
2008     TYPE *d = vd, *n = vn;                              \
2009     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2010         TYPE tmp = n[i] >> (shift - 1);                 \
2011         d[i] += (tmp >> 1) + (tmp & 1);                 \
2012     }                                                   \
2013     clear_tail(d, oprsz, simd_maxsz(desc));             \
2014 }
2015 
2016 DO_RSRA(gvec_srsra_b, int8_t)
2017 DO_RSRA(gvec_srsra_h, int16_t)
2018 DO_RSRA(gvec_srsra_s, int32_t)
2019 DO_RSRA(gvec_srsra_d, int64_t)
2020 
2021 DO_RSRA(gvec_ursra_b, uint8_t)
2022 DO_RSRA(gvec_ursra_h, uint16_t)
2023 DO_RSRA(gvec_ursra_s, uint32_t)
2024 DO_RSRA(gvec_ursra_d, uint64_t)
2025 
2026 #undef DO_RSRA
2027 
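     /*
      * Shift-and-insert: SRI writes n >> shift into the low (esize - shift)
      * bits of d, preserving the top shift bits; SLI writes n << shift into
      * the top (esize - shift) bits of d, preserving the bottom shift bits.
      */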
2028 #define DO_SRI(NAME, TYPE)                              \
2029 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2030 {                                                       \
2031     intptr_t i, oprsz = simd_oprsz(desc);               \
2032     int shift = simd_data(desc);                        \
2033     TYPE *d = vd, *n = vn;                              \
2034     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2035         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
2036     }                                                   \
2037     clear_tail(d, oprsz, simd_maxsz(desc));             \
2038 }
2039 
2040 DO_SRI(gvec_sri_b, uint8_t)
2041 DO_SRI(gvec_sri_h, uint16_t)
2042 DO_SRI(gvec_sri_s, uint32_t)
2043 DO_SRI(gvec_sri_d, uint64_t)
2044 
2045 #undef DO_SRI
2046 
2047 #define DO_SLI(NAME, TYPE)                              \
2048 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2049 {                                                       \
2050     intptr_t i, oprsz = simd_oprsz(desc);               \
2051     int shift = simd_data(desc);                        \
2052     TYPE *d = vd, *n = vn;                              \
2053     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2054         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
2055     }                                                   \
2056     clear_tail(d, oprsz, simd_maxsz(desc));             \
2057 }
2058 
2059 DO_SLI(gvec_sli_b, uint8_t)
2060 DO_SLI(gvec_sli_h, uint16_t)
2061 DO_SLI(gvec_sli_s, uint32_t)
2062 DO_SLI(gvec_sli_d, uint64_t)
2063 
2064 #undef DO_SLI
2065 
2066 /*
2067  * Convert float16 to float32, raising no exceptions and
2068  * preserving exceptional values, including SNaN.
2069  * This is effectively an unpack+repack operation.
2070  */
2071 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
2072 {
2073     const int f16_bias = 15;
2074     const int f32_bias = 127;
2075     uint32_t sign = extract32(f16, 15, 1);
2076     uint32_t exp = extract32(f16, 10, 5);
2077     uint32_t frac = extract32(f16, 0, 10);
2078 
2079     if (exp == 0x1f) {
2080         /* Inf or NaN */
2081         exp = 0xff;
2082     } else if (exp == 0) {
2083         /* Zero or denormal.  */
2084         if (frac != 0) {
2085             if (fz16) {
2086                 frac = 0;
2087             } else {
2088                 /*
2089                  * Denormal; these are all normal float32.
2090                  * Shift the fraction so that the msb is at bit 11,
2091                  * then remove bit 11 as the implicit bit of the
2092                  * normalized float32.  Note that we still go through
2093                  * the shift for normal numbers below, to put the
2094                  * float32 fraction at the right place.
2095                  */
2096                 int shift = clz32(frac) - 21;
2097                 frac = (frac << shift) & 0x3ff;
2098                 exp = f32_bias - f16_bias - shift + 1;
2099             }
2100         }
2101     } else {
2102         /* Normal number; adjust the bias.  */
2103         exp += f32_bias - f16_bias;
2104     }
2105     sign <<= 31;
2106     exp <<= 23;
2107     frac <<= 23 - 10;
2108 
2109     return sign | exp | frac;
2110 }
2111 
2112 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2113 {
2114     /*
2115      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2116      * Load the 2nd qword iff is_q & is_2.
2117      * Shift to the 2nd dword iff !is_q & is_2.
2118      * For !is_q & !is_2, the upper bits of the result are garbage.
2119      */
2120     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2121 }
2122 
2123 /*
2124  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2125  * as there are not yet SVE versions that might use blocking.
2126  */
2127 
2128 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2129                      uint64_t negx, int negf, uint32_t desc, bool fz16)
2130 {
2131     intptr_t i, oprsz = simd_oprsz(desc);
2132     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2133     int is_q = oprsz == 16;
2134     uint64_t n_4, m_4;
2135 
2136     /*
2137      * Pre-load all of the f16 data, avoiding overlap issues.
2138      * Negate all inputs for AH=0 FMLSL at once.
2139      */
2140     n_4 = load4_f16(vn, is_q, is_2) ^ negx;
2141     m_4 = load4_f16(vm, is_q, is_2);
2142 
2143     for (i = 0; i < oprsz / 4; i++) {
2144         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2145         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2146         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
2147     }
2148     clear_tail(d, oprsz, simd_maxsz(desc));
2149 }
2150 
2151 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2152                             CPUARMState *env, uint32_t desc)
2153 {
2154     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2155     uint64_t negx = is_s ? 0x8000800080008000ull : 0;
2156 
2157     do_fmlal(vd, vn, vm, &env->vfp.fp_status[FPST_STD], negx, 0, desc,
2158              get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A32_F16]));
2159 }
2160 
2161 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2162                             CPUARMState *env, uint32_t desc)
2163 {
2164     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2165     uint64_t negx = 0;
2166     int negf = 0;
2167 
2168     if (is_s) {
2169         if (env->vfp.fpcr & FPCR_AH) {
2170             negf = float_muladd_negate_product;
2171         } else {
2172             negx = 0x8000800080008000ull;
2173         }
2174     }
2175     do_fmlal(vd, vn, vm, &env->vfp.fp_status[FPST_A64], negx, negf, desc,
2176              get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A64_F16]));
2177 }
2178 
2179 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2180                                CPUARMState *env, uint32_t desc)
2181 {
2182     intptr_t i, oprsz = simd_oprsz(desc);
2183     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2184     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2185     float_status *status = &env->vfp.fp_status[FPST_A64];
2186     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A64_F16]);
2187     int negx = 0, negf = 0;
2188 
2189     if (is_s) {
2190         if (env->vfp.fpcr & FPCR_AH) {
2191             negf = float_muladd_negate_product;
2192         } else {
2193             negx = 0x8000;
2194         }
2195     }
2196 
2197     for (i = 0; i < oprsz; i += sizeof(float32)) {
2198         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negx;
2199         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2200         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2201         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2202         float32 aa = *(float32 *)(va + H1_4(i));
2203 
2204         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, negf, status);
2205     }
2206 }
2207 
2208 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2209                          uint64_t negx, int negf, uint32_t desc, bool fz16)
2210 {
2211     intptr_t i, oprsz = simd_oprsz(desc);
2212     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2213     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2214     int is_q = oprsz == 16;
2215     uint64_t n_4;
2216     float32 m_1;
2217 
2218     /*
2219      * Pre-load all of the f16 data, avoiding overlap issues.
2220      * Negate all inputs for AH=0 FMLSL at once.
2221      */
2222     n_4 = load4_f16(vn, is_q, is_2) ^ negx;
2223     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2224 
2225     for (i = 0; i < oprsz / 4; i++) {
2226         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2227         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
2228     }
2229     clear_tail(d, oprsz, simd_maxsz(desc));
2230 }
2231 
2232 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2233                                 CPUARMState *env, uint32_t desc)
2234 {
2235     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2236     uint64_t negx = is_s ? 0x8000800080008000ull : 0;
2237 
2238     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status[FPST_STD], negx, 0, desc,
2239                  get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A32_F16]));
2240 }
2241 
2242 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2243                                 CPUARMState *env, uint32_t desc)
2244 {
2245     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2246     uint64_t negx = 0;
2247     int negf = 0;
2248 
2249     if (is_s) {
2250         if (env->vfp.fpcr & FPCR_AH) {
2251             negf = float_muladd_negate_product;
2252         } else {
2253             negx = 0x8000800080008000ull;
2254         }
2255     }
2256     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status[FPST_A64], negx, negf, desc,
2257                  get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A64_F16]));
2258 }
2259 
2260 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2261                                CPUARMState *env, uint32_t desc)
2262 {
2263     intptr_t i, j, oprsz = simd_oprsz(desc);
2264     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2265     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2266     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2267     float_status *status = &env->vfp.fp_status[FPST_A64];
2268     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A64_F16]);
2269     int negx = 0, negf = 0;
2270 
2271     if (is_s) {
2272         if (env->vfp.fpcr & FPCR_AH) {
2273             negf = float_muladd_negate_product;
2274         } else {
2275             negx = 0x8000;
2276         }
2277     }
2278     for (i = 0; i < oprsz; i += 16) {
2279         float16 mm_16 = *(float16 *)(vm + i + idx);
2280         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2281 
2282         for (j = 0; j < 16; j += sizeof(float32)) {
2283             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negx;
2284             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2285             float32 aa = *(float32 *)(va + H1_4(i + j));
2286 
2287             *(float32 *)(vd + H1_4(i + j)) =
2288                 float32_muladd(nn, mm, aa, negf, status);
2289         }
2290     }
2291 }
2292 
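     /*
      * Variable vector shifts: the shift count is the signed low byte of
      * each element of the second operand.  Non-negative counts shift left,
      * negative counts shift right by the magnitude; left shifts of esize
      * or more give zero, while over-large right shifts give zero for the
      * unsigned forms and the sign for the signed forms.
      */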
2293 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2294 {
2295     intptr_t i, opr_sz = simd_oprsz(desc);
2296     int8_t *d = vd, *n = vn, *m = vm;
2297 
2298     for (i = 0; i < opr_sz; ++i) {
2299         int8_t mm = m[i];
2300         int8_t nn = n[i];
2301         int8_t res = 0;
2302         if (mm >= 0) {
2303             if (mm < 8) {
2304                 res = nn << mm;
2305             }
2306         } else {
2307             res = nn >> (mm > -8 ? -mm : 7);
2308         }
2309         d[i] = res;
2310     }
2311     clear_tail(d, opr_sz, simd_maxsz(desc));
2312 }
2313 
2314 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2315 {
2316     intptr_t i, opr_sz = simd_oprsz(desc);
2317     int16_t *d = vd, *n = vn, *m = vm;
2318 
2319     for (i = 0; i < opr_sz / 2; ++i) {
2320         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2321         int16_t nn = n[i];
2322         int16_t res = 0;
2323         if (mm >= 0) {
2324             if (mm < 16) {
2325                 res = nn << mm;
2326             }
2327         } else {
2328             res = nn >> (mm > -16 ? -mm : 15);
2329         }
2330         d[i] = res;
2331     }
2332     clear_tail(d, opr_sz, simd_maxsz(desc));
2333 }
2334 
2335 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2336 {
2337     intptr_t i, opr_sz = simd_oprsz(desc);
2338     uint8_t *d = vd, *n = vn, *m = vm;
2339 
2340     for (i = 0; i < opr_sz; ++i) {
2341         int8_t mm = m[i];
2342         uint8_t nn = n[i];
2343         uint8_t res = 0;
2344         if (mm >= 0) {
2345             if (mm < 8) {
2346                 res = nn << mm;
2347             }
2348         } else {
2349             if (mm > -8) {
2350                 res = nn >> -mm;
2351             }
2352         }
2353         d[i] = res;
2354     }
2355     clear_tail(d, opr_sz, simd_maxsz(desc));
2356 }
2357 
2358 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2359 {
2360     intptr_t i, opr_sz = simd_oprsz(desc);
2361     uint16_t *d = vd, *n = vn, *m = vm;
2362 
2363     for (i = 0; i < opr_sz / 2; ++i) {
2364         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2365         uint16_t nn = n[i];
2366         uint16_t res = 0;
2367         if (mm >= 0) {
2368             if (mm < 16) {
2369                 res = nn << mm;
2370             }
2371         } else {
2372             if (mm > -16) {
2373                 res = nn >> -mm;
2374             }
2375         }
2376         d[i] = res;
2377     }
2378     clear_tail(d, opr_sz, simd_maxsz(desc));
2379 }
2380 
2381 /*
2382  * 8x8->8 polynomial multiply.
2383  *
2384  * Polynomial multiplication is like integer multiplication except the
2385  * partial products are XORed, not added.
2386  *
2387  * TODO: expose this as a generic vector operation, as it is a common
2388  * crypto building block.
2389  */
2390 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2391 {
2392     intptr_t i, opr_sz = simd_oprsz(desc);
2393     uint64_t *d = vd, *n = vn, *m = vm;
2394 
2395     for (i = 0; i < opr_sz / 8; ++i) {
2396         d[i] = clmul_8x8_low(n[i], m[i]);
2397     }
2398     clear_tail(d, opr_sz, simd_maxsz(desc));
2399 }
2400 
2401 /*
2402  * 64x64->128 polynomial multiply.
2403  * Because the lanes are not accessed in strict columns,
2404  * this probably cannot be turned into a generic helper.
2405  */
2406 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2407 {
2408     intptr_t i, opr_sz = simd_oprsz(desc);
2409     intptr_t hi = simd_data(desc);
2410     uint64_t *d = vd, *n = vn, *m = vm;
2411 
2412     for (i = 0; i < opr_sz / 8; i += 2) {
2413         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2414         d[i] = int128_getlo(r);
2415         d[i + 1] = int128_gethi(r);
2416     }
2417     clear_tail(d, opr_sz, simd_maxsz(desc));
2418 }
2419 
2420 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2421 {
2422     int hi = simd_data(desc);
2423     uint64_t *d = vd, *n = vn, *m = vm;
2424     uint64_t nn = n[hi], mm = m[hi];
2425 
2426     d[0] = clmul_8x4_packed(nn, mm);
2427     nn >>= 32;
2428     mm >>= 32;
2429     d[1] = clmul_8x4_packed(nn, mm);
2430 
2431     clear_tail(d, 16, simd_maxsz(desc));
2432 }
2433 
2434 #ifdef TARGET_AARCH64
2435 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2436 {
2437     int shift = simd_data(desc) * 8;
2438     intptr_t i, opr_sz = simd_oprsz(desc);
2439     uint64_t *d = vd, *n = vn, *m = vm;
2440 
2441     for (i = 0; i < opr_sz / 8; ++i) {
2442         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2443     }
2444 }
2445 
2446 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2447 {
2448     intptr_t sel = H4(simd_data(desc));
2449     intptr_t i, opr_sz = simd_oprsz(desc);
2450     uint32_t *n = vn, *m = vm;
2451     uint64_t *d = vd;
2452 
2453     for (i = 0; i < opr_sz / 8; ++i) {
2454         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2455     }
2456 }
2457 #endif
2458 
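     /*
      * Compare each element against zero, producing all-ones per element
      * for a true comparison and all-zeros for false.
      */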
2459 #define DO_CMP0(NAME, TYPE, OP)                         \
2460 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2461 {                                                       \
2462     intptr_t i, opr_sz = simd_oprsz(desc);              \
2463     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2464         TYPE nn = *(TYPE *)(vn + i);                    \
2465         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2466     }                                                   \
2467     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2468 }
2469 
2470 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2471 DO_CMP0(gvec_clt0_b, int8_t, <)
2472 DO_CMP0(gvec_cle0_b, int8_t, <=)
2473 DO_CMP0(gvec_cgt0_b, int8_t, >)
2474 DO_CMP0(gvec_cge0_b, int8_t, >=)
2475 
2476 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2477 DO_CMP0(gvec_clt0_h, int16_t, <)
2478 DO_CMP0(gvec_cle0_h, int16_t, <=)
2479 DO_CMP0(gvec_cgt0_h, int16_t, >)
2480 DO_CMP0(gvec_cge0_h, int16_t, >=)
2481 
2482 #undef DO_CMP0
2483 
2484 #define DO_ABD(NAME, TYPE)                                      \
2485 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2486 {                                                               \
2487     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2488     TYPE *d = vd, *n = vn, *m = vm;                             \
2489                                                                 \
2490     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2491         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2492     }                                                           \
2493     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2494 }
2495 
2496 DO_ABD(gvec_sabd_b, int8_t)
2497 DO_ABD(gvec_sabd_h, int16_t)
2498 DO_ABD(gvec_sabd_s, int32_t)
2499 DO_ABD(gvec_sabd_d, int64_t)
2500 
2501 DO_ABD(gvec_uabd_b, uint8_t)
2502 DO_ABD(gvec_uabd_h, uint16_t)
2503 DO_ABD(gvec_uabd_s, uint32_t)
2504 DO_ABD(gvec_uabd_d, uint64_t)
2505 
2506 #undef DO_ABD
2507 
2508 #define DO_ABA(NAME, TYPE)                                      \
2509 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2510 {                                                               \
2511     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2512     TYPE *d = vd, *n = vn, *m = vm;                             \
2513                                                                 \
2514     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2515         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2516     }                                                           \
2517     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2518 }
2519 
2520 DO_ABA(gvec_saba_b, int8_t)
2521 DO_ABA(gvec_saba_h, int16_t)
2522 DO_ABA(gvec_saba_s, int32_t)
2523 DO_ABA(gvec_saba_d, int64_t)
2524 
2525 DO_ABA(gvec_uaba_b, uint8_t)
2526 DO_ABA(gvec_uaba_h, uint16_t)
2527 DO_ABA(gvec_uaba_s, uint32_t)
2528 DO_ABA(gvec_uaba_d, uint64_t)
2529 
2530 #undef DO_ABA
2531 
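     /*
      * Pairwise operations: the low half of the result is built from
      * adjacent pairs of the first source and the high half from adjacent
      * pairs of the second.  Writing d in place only overwrites first-source
      * elements that have already been consumed, so only d == m needs the
      * scratch copy.
      */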
2532 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2533 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
2534                   float_status *stat, uint32_t desc)                       \
2535 {                                                                          \
2536     ARMVectorReg scratch;                                                  \
2537     intptr_t oprsz = simd_oprsz(desc);                                     \
2538     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2539     TYPE *d = vd, *n = vn, *m = vm;                                        \
2540     if (unlikely(d == m)) {                                                \
2541         m = memcpy(&scratch, m, oprsz);                                    \
2542     }                                                                      \
2543     for (intptr_t i = 0; i < half; ++i) {                                  \
2544         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2545     }                                                                      \
2546     for (intptr_t i = 0; i < half; ++i) {                                  \
2547         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2548     }                                                                      \
2549     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2550 }
2551 
2552 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2553 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2554 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2555 
2556 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2557 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2558 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2559 
2560 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2561 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2562 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2563 
2564 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2565 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2566 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2567 
2568 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2569 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2570 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2571 
2572 #ifdef TARGET_AARCH64
2573 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
2574 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
2575 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, )
2576 
2577 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
2578 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
2579 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, )
2580 #endif
2581 
2582 #undef DO_3OP_PAIR
2583 
2584 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2585 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2586 {                                                               \
2587     ARMVectorReg scratch;                                       \
2588     intptr_t oprsz = simd_oprsz(desc);                          \
2589     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2590     TYPE *d = vd, *n = vn, *m = vm;                             \
2591     if (unlikely(d == m)) {                                     \
2592         m = memcpy(&scratch, m, oprsz);                         \
2593     }                                                           \
2594     for (intptr_t i = 0; i < half; ++i) {                       \
2595         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2596     }                                                           \
2597     for (intptr_t i = 0; i < half; ++i) {                       \
2598         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2599     }                                                           \
2600     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2601 }
2602 
2603 #define ADD(A, B) (A + B)
2604 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2605 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2606 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2607 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2608 #undef  ADD
2609 
2610 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2611 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2612 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2613 
2614 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2615 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2616 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2617 
2618 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2619 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2620 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2621 
2622 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2623 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2624 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2625 
2626 #undef DO_3OP_PAIR
2627 
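     /*
      * Fixed-point conversions: FUNC converts between TYPE and the
      * same-sized float format with 'shift' fractional bits, taken from
      * the simd descriptor.
      */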
2628 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2629     void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
2630     {                                                                   \
2631         intptr_t i, oprsz = simd_oprsz(desc);                           \
2632         int shift = simd_data(desc);                                    \
2633         TYPE *d = vd, *n = vn;                                          \
2634         float_status *fpst = stat;                                      \
2635         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2636             d[i] = FUNC(n[i], shift, fpst);                             \
2637         }                                                               \
2638         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2639     }
2640 
2641 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2642 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2643 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2644 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2645 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2646 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2647 
2648 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2649 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2650 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2651 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2652 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2653 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2654 
2655 #undef DO_VCVT_FIXED
2656 
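     /*
      * Conversions (and, below, roundings) with an explicit rounding mode:
      * install the mode from the descriptor for the duration of the loop,
      * then restore the previous mode.
      */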
2657 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2658     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2659     {                                                                   \
2660         intptr_t i, oprsz = simd_oprsz(desc);                           \
2661         uint32_t rmode = simd_data(desc);                               \
2662         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2663         TYPE *d = vd, *n = vn;                                          \
2664         set_float_rounding_mode(rmode, fpst);                           \
2665         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2666             d[i] = FUNC(n[i], 0, fpst);                                 \
2667         }                                                               \
2668         set_float_rounding_mode(prev_rmode, fpst);                      \
2669         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2670     }
2671 
2672 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
2673 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
2674 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2675 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2676 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2677 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2678 
2679 #undef DO_VCVT_RMODE
2680 
2681 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2682     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2683     {                                                                   \
2684         intptr_t i, oprsz = simd_oprsz(desc);                           \
2685         uint32_t rmode = simd_data(desc);                               \
2686         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2687         TYPE *d = vd, *n = vn;                                          \
2688         set_float_rounding_mode(rmode, fpst);                           \
2689         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2690             d[i] = FUNC(n[i], fpst);                                    \
2691         }                                                               \
2692         set_float_rounding_mode(prev_rmode, fpst);                      \
2693         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2694     }
2695 
2696 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2697 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2698 
2699 #undef DO_VRINT_RMODE
2700 
2701 #ifdef TARGET_AARCH64
2702 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
2703 {
2704     const uint8_t *indices = vm;
2705     size_t oprsz = simd_oprsz(desc);
2706     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2707     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2708     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2709     union {
2710         uint8_t b[16];
2711         uint64_t d[2];
2712     } result;
2713 
2714     /*
2715      * We must construct the final result in a temp, lest the output
2716      * overlaps the input table.  For TBL, begin with zero; for TBX,
2717      * begin with the original register contents.  Note that we always
2718      * copy 16 bytes here to avoid an extra branch; clearing the high
2719      * bits of the register for oprsz == 8 is handled below.
2720      */
2721     if (is_tbx) {
2722         memcpy(&result, vd, 16);
2723     } else {
2724         memset(&result, 0, 16);
2725     }
2726 
2727     for (size_t i = 0; i < oprsz; ++i) {
2728         uint32_t index = indices[H1(i)];
2729 
2730         if (index < table_len) {
2731             /*
2732              * Convert index (a byte offset into the virtual table
2733              * which is a series of 128-bit vectors concatenated)
2734              * into the correct register element, bearing in mind
2735              * that the table can wrap around from V31 to V0.
2736              */
2737             const uint8_t *table = (const uint8_t *)
2738                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2739             result.b[H1(i)] = table[H1(index % 16)];
2740         }
2741     }
2742 
2743     memcpy(vd, &result, 16);
2744     clear_tail(vd, oprsz, simd_maxsz(desc));
2745 }
2746 #endif
2747 
2748 /*
2749  * NxN -> N highpart multiply
2750  *
2751  * TODO: expose this as a generic vector operation.
2752  */
2753 
2754 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2755 {
2756     intptr_t i, opr_sz = simd_oprsz(desc);
2757     int8_t *d = vd, *n = vn, *m = vm;
2758 
2759     for (i = 0; i < opr_sz; ++i) {
2760         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2761     }
2762     clear_tail(d, opr_sz, simd_maxsz(desc));
2763 }
2764 
2765 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2766 {
2767     intptr_t i, opr_sz = simd_oprsz(desc);
2768     int16_t *d = vd, *n = vn, *m = vm;
2769 
2770     for (i = 0; i < opr_sz / 2; ++i) {
2771         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2772     }
2773     clear_tail(d, opr_sz, simd_maxsz(desc));
2774 }
2775 
2776 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2777 {
2778     intptr_t i, opr_sz = simd_oprsz(desc);
2779     int32_t *d = vd, *n = vn, *m = vm;
2780 
2781     for (i = 0; i < opr_sz / 4; ++i) {
2782         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2783     }
2784     clear_tail(d, opr_sz, simd_maxsz(desc));
2785 }
2786 
2787 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2788 {
2789     intptr_t i, opr_sz = simd_oprsz(desc);
2790     uint64_t *d = vd, *n = vn, *m = vm;
2791     uint64_t discard;
2792 
2793     for (i = 0; i < opr_sz / 8; ++i) {
2794         muls64(&discard, &d[i], n[i], m[i]);
2795     }
2796     clear_tail(d, opr_sz, simd_maxsz(desc));
2797 }
2798 
2799 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2800 {
2801     intptr_t i, opr_sz = simd_oprsz(desc);
2802     uint8_t *d = vd, *n = vn, *m = vm;
2803 
2804     for (i = 0; i < opr_sz; ++i) {
2805         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2806     }
2807     clear_tail(d, opr_sz, simd_maxsz(desc));
2808 }
2809 
2810 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2811 {
2812     intptr_t i, opr_sz = simd_oprsz(desc);
2813     uint16_t *d = vd, *n = vn, *m = vm;
2814 
2815     for (i = 0; i < opr_sz / 2; ++i) {
2816         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2817     }
2818     clear_tail(d, opr_sz, simd_maxsz(desc));
2819 }
2820 
2821 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2822 {
2823     intptr_t i, opr_sz = simd_oprsz(desc);
2824     uint32_t *d = vd, *n = vn, *m = vm;
2825 
2826     for (i = 0; i < opr_sz / 4; ++i) {
2827         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2828     }
2829     clear_tail(d, opr_sz, simd_maxsz(desc));
2830 }
2831 
2832 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2833 {
2834     intptr_t i, opr_sz = simd_oprsz(desc);
2835     uint64_t *d = vd, *n = vn, *m = vm;
2836     uint64_t discard;
2837 
2838     for (i = 0; i < opr_sz / 8; ++i) {
2839         mulu64(&discard, &d[i], n[i], m[i]);
2840     }
2841     clear_tail(d, opr_sz, simd_maxsz(desc));
2842 }
2843 
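/*
 * XAR (exclusive-OR and rotate): rotate each 64-bit element of (n ^ m)
 * right by the immediate rotation amount.
 */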
2844 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2845 {
2846     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2847     int shr = simd_data(desc);
2848     uint64_t *d = vd, *n = vn, *m = vm;
2849 
2850     for (i = 0; i < opr_sz; ++i) {
2851         d[i] = ror64(n[i] ^ m[i], shr);
2852     }
2853     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2854 }
2855 
2856 /*
2857  * Integer matrix-multiply accumulate
2858  */
2859 
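/*
 * Per 16-byte segment, the helpers below are equivalent to this scalar
 * triple loop (editorial sketch; the H1/H4 host-endianness adjustments
 * and the signedness of the n/m elements are omitted):
 *
 *  for (i = 0; i < 2; i++) {
 *      for (j = 0; j < 2; j++) {
 *          sum = a[2 * i + j];
 *          for (k = 0; k < 8; k++) {
 *              sum += n[8 * i + k] * m[8 * j + k];
 *          }
 *          d[2 * i + j] = sum;
 *      }
 *  }
 */
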
2860 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2861 {
2862     int8_t *n = vn, *m = vm;
2863 
2864     for (intptr_t k = 0; k < 8; ++k) {
2865         sum += n[H1(k)] * m[H1(k)];
2866     }
2867     return sum;
2868 }
2869 
2870 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2871 {
2872     uint8_t *n = vn, *m = vm;
2873 
2874     for (intptr_t k = 0; k < 8; ++k) {
2875         sum += n[H1(k)] * m[H1(k)];
2876     }
2877     return sum;
2878 }
2879 
2880 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2881 {
2882     uint8_t *n = vn;
2883     int8_t *m = vm;
2884 
2885     for (intptr_t k = 0; k < 8; ++k) {
2886         sum += n[H1(k)] * m[H1(k)];
2887     }
2888     return sum;
2889 }
2890 
2891 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2892                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2893 {
2894     intptr_t seg, opr_sz = simd_oprsz(desc);
2895 
2896     for (seg = 0; seg < opr_sz; seg += 16) {
2897         uint32_t *d = vd + seg;
2898         uint32_t *a = va + seg;
2899         uint32_t sum0, sum1, sum2, sum3;
2900 
2901         /*
2902          * Process the entire segment at once, writing back the
2903          * results only after we've consumed all of the inputs.
2904          *
2905          * Key to the indices below: result element (i, j) uses index
2906          * (2*i + j) in a/d, byte offset 8*i in vn and 8*j in vm.
2907          */
2908         sum0 = a[H4(0 + 0)];
2909         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2910         sum1 = a[H4(0 + 1)];
2911         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2912         sum2 = a[H4(2 + 0)];
2913         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2914         sum3 = a[H4(2 + 1)];
2915         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2916 
2917         d[H4(0)] = sum0;
2918         d[H4(1)] = sum1;
2919         d[H4(2)] = sum2;
2920         d[H4(3)] = sum3;
2921     }
2922     clear_tail(vd, opr_sz, simd_maxsz(desc));
2923 }
2924 
2925 #define DO_MMLA_B(NAME, INNER) \
2926     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2927     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2928 
2929 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2930 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2931 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2932 
2933 /*
2934  * BFloat16 Dot Product
2935  */
2936 
2937 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2938 {
2939     /*
2940      * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2941      * For EBF = 0, we ignore the FPCR bits which determine rounding
2942      * mode and denormal-flushing, and we do unfused multiplies and
2943      * additions with intermediate rounding of all products and sums.
2944      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2945      * and we perform a fused two-way sum-of-products without intermediate
2946      * rounding of the products.
2947      * In either case, we don't set fp exception flags.
2948      *
2949      * EBF is AArch64 only, so even if it's set in the FPCR it has
2950      * no effect on AArch32 instructions.
2951      */
2952     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2953 
2954     *statusp = is_a64(env) ? env->vfp.fp_status[FPST_A64] : env->vfp.fp_status_a32;
2955     set_default_nan_mode(true, statusp);
2956 
2957     if (ebf) {
2958         /* EBF=1 needs to do a step with round-to-odd semantics */
2959         *oddstatusp = *statusp;
2960         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2961     } else {
2962         set_flush_to_zero(true, statusp);
2963         set_flush_inputs_to_zero(true, statusp);
2964         set_float_rounding_mode(float_round_to_odd_inf, statusp);
2965     }
2966     return ebf;
2967 }
2968 
2969 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2970 {
2971     float32 t1, t2;
2972 
2973     /*
2974      * Extract each BFloat16 from the element pair and widen it to
2975      * float32: the low half by a left shift, the high half by masking.
2976      */
2977     t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2978     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2979     t1 = float32_add(t1, t2, fpst);
2980     t1 = float32_add(sum, t1, fpst);
2981 
2982     return t1;
2983 }
2984 
2985 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
2986                      float_status *fpst, float_status *fpst_odd)
2987 {
2988     /*
2989      * Compare f16_dotadd() in sme_helper.c, but here we have
2990      * bfloat16 inputs. In particular that means that we do not
2991      * want the FPCR.FZ16 flush semantics, so we use the normal
2992      * float_status for the input handling here.
2993      */
2994     float64 e1r = float32_to_float64(e1 << 16, fpst);
2995     float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
2996     float64 e2r = float32_to_float64(e2 << 16, fpst);
2997     float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
2998     float64 t64;
2999     float32 t32;
3000 
3001     /*
3002      * The ARM pseudocode function FPDot performs both multiplies
3003      * and the add with a single rounding operation.  Emulate this
3004      * by performing the first multiply in round-to-odd, then doing
3005      * the second multiply as fused multiply-add, and rounding to
3006      * float32 all in one step.
3007      */
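    /*
     * Rounding the first product to odd keeps any sticky information in
     * the low bit of the float64 intermediate, so even if that product
     * were inexact, the single rounding to float32 precision performed
     * by float64r32_muladd cannot suffer a double-rounding error.
     */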
3008     t64 = float64_mul(e1r, e2r, fpst_odd);
3009     t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
3010 
3011     /* This conversion is exact, because we've already rounded. */
3012     t32 = float64_to_float32(t64, fpst);
3013 
3014     /* The final accumulation step is not fused. */
3015     return float32_add(sum, t32, fpst);
3016 }
3017 
3018 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
3019                         CPUARMState *env, uint32_t desc)
3020 {
3021     intptr_t i, opr_sz = simd_oprsz(desc);
3022     float32 *d = vd, *a = va;
3023     uint32_t *n = vn, *m = vm;
3024     float_status fpst, fpst_odd;
3025 
3026     if (is_ebf(env, &fpst, &fpst_odd)) {
3027         for (i = 0; i < opr_sz / 4; ++i) {
3028             d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
3029         }
3030     } else {
3031         for (i = 0; i < opr_sz / 4; ++i) {
3032             d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
3033         }
3034     }
3035     clear_tail(d, opr_sz, simd_maxsz(desc));
3036 }
3037 
3038 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
3039                             void *va, CPUARMState *env, uint32_t desc)
3040 {
3041     intptr_t i, j, opr_sz = simd_oprsz(desc);
3042     intptr_t index = simd_data(desc);
3043     intptr_t elements = opr_sz / 4;
3044     intptr_t eltspersegment = MIN(16 / 4, elements);
3045     float32 *d = vd, *a = va;
3046     uint32_t *n = vn, *m = vm;
3047     float_status fpst, fpst_odd;
3048 
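    /*
     * Each 16-byte segment of m supplies one 32-bit element pair, chosen
     * by 'index', which is shared by every element of that segment of
     * n, a and d.
     */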
3049     if (is_ebf(env, &fpst, &fpst_odd)) {
3050         for (i = 0; i < elements; i += eltspersegment) {
3051             uint32_t m_idx = m[i + H4(index)];
3052 
3053             for (j = i; j < i + eltspersegment; j++) {
3054                 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
3055             }
3056         }
3057     } else {
3058         for (i = 0; i < elements; i += eltspersegment) {
3059             uint32_t m_idx = m[i + H4(index)];
3060 
3061             for (j = i; j < i + eltspersegment; j++) {
3062                 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
3063             }
3064         }
3065     }
3066     clear_tail(d, opr_sz, simd_maxsz(desc));
3067 }
3068 
3069 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
3070                          CPUARMState *env, uint32_t desc)
3071 {
3072     intptr_t s, opr_sz = simd_oprsz(desc);
3073     float32 *d = vd, *a = va;
3074     uint32_t *n = vn, *m = vm;
3075     float_status fpst, fpst_odd;
3076 
3077     if (is_ebf(env, &fpst, &fpst_odd)) {
3078         for (s = 0; s < opr_sz / 4; s += 4) {
3079             float32 sum00, sum01, sum10, sum11;
3080 
3081             /*
3082              * Process the entire segment at once, writing back the
3083              * results only after we've consumed all of the inputs.
3084              *
3085              * Key to the indices below: result element (i, j) uses index
3086              * (2*i + j) in a/d, (2*i + k) in n and (2*j + k) in m.
3087              */
3088             sum00 = a[s + H4(0 + 0)];
3089             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3090             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3091 
3092             sum01 = a[s + H4(0 + 1)];
3093             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3094             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3095 
3096             sum10 = a[s + H4(2 + 0)];
3097             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3098             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3099 
3100             sum11 = a[s + H4(2 + 1)];
3101             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3102             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3103 
3104             d[s + H4(0 + 0)] = sum00;
3105             d[s + H4(0 + 1)] = sum01;
3106             d[s + H4(2 + 0)] = sum10;
3107             d[s + H4(2 + 1)] = sum11;
3108         }
3109     } else {
3110         for (s = 0; s < opr_sz / 4; s += 4) {
3111             float32 sum00, sum01, sum10, sum11;
3112 
3113             /*
3114              * Process the entire segment at once, writing back the
3115              * results only after we've consumed all of the inputs.
3116              *
3117              * Key to the indices below: result element (i, j) uses index
3118              * (2*i + j) in a/d, (2*i + k) in n and (2*j + k) in m.
3119              */
3120             sum00 = a[s + H4(0 + 0)];
3121             sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
3122             sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
3123 
3124             sum01 = a[s + H4(0 + 1)];
3125             sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
3126             sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
3127 
3128             sum10 = a[s + H4(2 + 0)];
3129             sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
3130             sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
3131 
3132             sum11 = a[s + H4(2 + 1)];
3133             sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
3134             sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
3135 
3136             d[s + H4(0 + 0)] = sum00;
3137             d[s + H4(0 + 1)] = sum01;
3138             d[s + H4(2 + 0)] = sum10;
3139             d[s + H4(2 + 1)] = sum11;
3140         }
3141     }
3142     clear_tail(d, opr_sz, simd_maxsz(desc));
3143 }
3144 
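/*
 * BFMLAL{B,T}: widen the bfloat16 elements selected by 'sel' (even- or
 * odd-numbered) to float32 and multiply-accumulate into the float32
 * destination.
 */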
3145 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
3146                          float_status *stat, uint32_t desc)
3147 {
3148     intptr_t i, opr_sz = simd_oprsz(desc);
3149     intptr_t sel = simd_data(desc);
3150     float32 *d = vd, *a = va;
3151     bfloat16 *n = vn, *m = vm;
3152 
3153     for (i = 0; i < opr_sz / 4; ++i) {
3154         float32 nn = n[H2(i * 2 + sel)] << 16;
3155         float32 mm = m[H2(i * 2 + sel)] << 16;
3156         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
3157     }
3158     clear_tail(d, opr_sz, simd_maxsz(desc));
3159 }
3160 
3161 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
3162                              void *va, float_status *stat, uint32_t desc)
3163 {
3164     intptr_t i, j, opr_sz = simd_oprsz(desc);
3165     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3166     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3167     intptr_t elements = opr_sz / 4;
3168     intptr_t eltspersegment = MIN(16 / 4, elements);
3169     float32 *d = vd, *a = va;
3170     bfloat16 *n = vn, *m = vm;
3171 
3172     for (i = 0; i < elements; i += eltspersegment) {
3173         float32 m_idx = m[H2(2 * i + index)] << 16;
3174 
3175         for (j = i; j < i + eltspersegment; j++) {
3176             float32 n_j = n[H2(2 * j + sel)] << 16;
3177             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
3178         }
3179     }
3180     clear_tail(d, opr_sz, simd_maxsz(desc));
3181 }
3182 
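/*
 * Clamp each element of 'a' to the inclusive range bounded below by the
 * corresponding element of 'n' and above by that of 'm'.
 */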
3183 #define DO_CLAMP(NAME, TYPE) \
3184 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
3185 {                                                                       \
3186     intptr_t i, opr_sz = simd_oprsz(desc);                              \
3187     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
3188         TYPE aa = *(TYPE *)(a + i);                                     \
3189         TYPE nn = *(TYPE *)(n + i);                                     \
3190         TYPE mm = *(TYPE *)(m + i);                                     \
3191         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
3192         *(TYPE *)(d + i) = dd;                                          \
3193     }                                                                   \
3194     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
3195 }
3196 
3197 DO_CLAMP(gvec_sclamp_b, int8_t)
3198 DO_CLAMP(gvec_sclamp_h, int16_t)
3199 DO_CLAMP(gvec_sclamp_s, int32_t)
3200 DO_CLAMP(gvec_sclamp_d, int64_t)
3201 
3202 DO_CLAMP(gvec_uclamp_b, uint8_t)
3203 DO_CLAMP(gvec_uclamp_h, uint16_t)
3204 DO_CLAMP(gvec_uclamp_s, uint32_t)
3205 DO_CLAMP(gvec_uclamp_d, uint64_t)
3206 
3207 /* Bit count in each 8-bit word. */
3208 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
3209 {
3210     intptr_t i, opr_sz = simd_oprsz(desc);
3211     uint8_t *d = vd, *n = vn;
3212 
3213     for (i = 0; i < opr_sz; ++i) {
3214         d[i] = ctpop8(n[i]);
3215     }
3216     clear_tail(d, opr_sz, simd_maxsz(desc));
3217 }
3218 
3219 /* Reverse bits in each 8-bit word. */
3220 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
3221 {
3222     intptr_t i, opr_sz = simd_oprsz(desc);
3223     uint64_t *d = vd, *n = vn;
3224 
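    /*
     * bswap64 followed by revbit64 reverses the bits within each byte
     * while leaving the byte order unchanged.
     */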
3225     for (i = 0; i < opr_sz / 8; ++i) {
3226         d[i] = revbit64(bswap64(n[i]));
3227     }
3228     clear_tail(d, opr_sz, simd_maxsz(desc));
3229 }
3230 
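/* URECPE: unsigned reciprocal estimate of each 32-bit element. */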
3231 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
3232 {
3233     intptr_t i, opr_sz = simd_oprsz(desc);
3234     uint32_t *d = vd, *n = vn;
3235 
3236     for (i = 0; i < opr_sz / 4; ++i) {
3237         d[i] = helper_recpe_u32(n[i]);
3238     }
3239     clear_tail(d, opr_sz, simd_maxsz(desc));
3240 }
3241 
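/* URSQRTE: unsigned reciprocal square root estimate of each 32-bit element. */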
3242 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
3243 {
3244     intptr_t i, opr_sz = simd_oprsz(desc);
3245     uint32_t *d = vd, *n = vn;
3246 
3247     for (i = 0; i < opr_sz / 4; ++i) {
3248         d[i] = helper_rsqrte_u32(n[i]);
3249     }
3250     clear_tail(d, opr_sz, simd_maxsz(desc));
3251 }
3252