/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};
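
/*
 * Illustrative example (added note, not part of the generator comment
 * above): each predicate bit selects one byte lane of a 64-bit chunk,
 * so bits 0 and 2 set gives
 *
 *     expand_pred_b_data[0x05] == 0x0000000000ff00ff
 *
 * i.e. byte lanes 0 and 2 expand to all-ones masks.
 */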

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};
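
/*
 * Example (for illustration only): for halfword elements only every
 * other predicate bit is significant, so index 0x11 (bits 0 and 4)
 * selects halfword lanes 0 and 2:
 *
 *     expand_pred_h_data[0x11] == 0x0000ffff0000ffff
 */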

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
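
/*
 * Worked example of the simplification above (added for illustration):
 * rounded SQRDMULH of 64 * 64 with a3 == 0:
 *   full form:  (((64 * 64) << 1) + (1 << 7)) >> 8 = (8192 + 128) >> 8 = 32
 *   simplified: ((64 * 64) + (1 << 6)) >> 7        = (4096 + 64) >> 7  = 32
 * The saturation check catches e.g. -128 * -128, where
 * (16384 + 64) >> 7 = 128 does not fit in int8_t and the result is
 * clamped to INT8_MAX.
 */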

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}
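
/*
 * Example of the 128-bit path (illustration only): for the worst case
 * n = m = INT64_MIN with a = 0, the product is 2^126; after adding the
 * rounding constant 2^62 and shifting right by 63 the value is 2^63,
 * which no longer fits in int64_t, so do_sat128_d() saturates to
 * INT64_MAX.
 */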

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
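
/*
 * Per-lane illustration (added, not from the original source): for
 * gvec_sdot_b each 32-bit destination lane accumulates four byte
 * products, e.g. n = {1, 2, 3, 4}, m = {5, 6, 7, 8}, a = 10 gives
 * d = 10 + 1*5 + 2*6 + 3*7 + 4*8 = 80.
 */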

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    /*                                                                    \
     * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
     * first iteration might not be a full 16 byte segment. But           \
     * for vector lengths beyond that this must be SVE and we know        \
     * opr_sz is a multiple of 16, so we need not clamp segend            \
     * to opr_sz_n when we advance it at the end of the loop.             \
     */                                                                   \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        segend = i + (16 / sizeof(TYPED));                                \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
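
/*
 * Illustration of the segment handling described in DO_DOT_IDX above
 * (added note): with a 32-byte SVE vector and 32-bit lanes, opr_sz_n is
 * 8 and the indexed group of four m elements is re-read at lanes 0 and
 * 4, i.e. once per 16-byte segment; the 8-byte AdvSIMD case simply
 * stops after its single, shortened segment.
 */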

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)];
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)];

        if (rot) {
            e3 = float16_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float16_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)];
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)];

        if (rot) {
            e3 = float32_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float32_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1];
        float64 e2 = n[i + 1];
        float64 e3 = m[i];

        if (rot) {
            e3 = float64_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float64_maybe_ah_chs(e1, fpcr_ah);
        }

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
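
/*
 * Summary of the gvec_fcadd* helpers above (added for clarity): per
 * (real, imaginary) pair they compute, with rot clear,
 *     d[2k]     = n[2k]     - m[2k + 1]
 *     d[2k + 1] = n[2k + 1] + m[2k]
 * and with rot set the negation moves to m[2k] instead, giving the two
 * FCADD rotations.  float*_maybe_ah_chs() applies the sign change,
 * except that with FPCR.AH set it leaves a NaN operand's sign alone.
 */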

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float16 negx_imag, negx_real;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 15;
    negx_imag = (negf_imag & ~fpcr_ah) << 15;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ negx_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], negf_real, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
    uint32_t negf_real = flip ^ negf_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    float16 negx_imag, negx_real;
    intptr_t i, j;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 15;
    negx_imag = (negf_imag & ~fpcr_ah) << 15;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = negx_real ^ (flip ? mi : mr);
        float16 e3 = negx_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], negf_real, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], negf_imag, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float32 negx_imag, negx_real;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 31;
    negx_imag = (negf_imag & ~fpcr_ah) << 31;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ negx_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], negf_real, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
    uint32_t negf_real = flip ^ negf_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    float32 negx_imag, negx_real;
    intptr_t i, j;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 31;
    negx_imag = (negf_imag & ~fpcr_ah) << 31;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = negx_real ^ (flip ? mi : mr);
        float32 e3 = negx_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], negf_real, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], negf_imag, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float64 negx_real, negx_imag;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
    negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ negx_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ negx_imag;

        d[i] = float64_muladd(e2, e1, a[i], negf_real, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
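
/*
 * Note on the gvec_fcmla* helpers above (added for clarity): each
 * (real, imaginary) pair is produced by two fused multiply-adds into
 * the addend a[].  'flip' selects whether the real or the imaginary
 * element of n is used as the common multiplicand, and the rotation's
 * sign changes are applied either by XORing the sign bit of the m
 * element (negx, FPCR.AH == 0) or by passing
 * float_muladd_negate_product to the fused multiply-add (negf,
 * FPCR.AH == 1), which avoids flipping the sign of a NaN input.
 */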

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
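/* For example, float32_ceq(a, b, s) yields 0xffffffff when a == b, else 0. */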
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_rpres_s, helper_recpe_rpres_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_rpres_s, helper_rsqrte_rpres_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float64)                \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)   \
    DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
{
    float16 r = float16_sub(op1, op2, stat);
    return float16_is_any_nan(r) ? r : float16_abs(r);
}

static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
{
    float32 r = float32_sub(op1, op2, stat);
    return float32_is_any_nan(r) ? r : float32_abs(r);
}

static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
{
    float64 r = float64_sub(op1, op2, stat);
    return float64_is_any_nan(r) ? r : float64_abs(r);
}

/*
 * Reciprocal step. These are the AArch32 version which uses a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}
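
/*
 * In other words (added note): the non-fused steps above compute
 *     recps(a, b)  = 2 - a * b
 *     rsqrts(a, b) = (3 - a * b) / 2
 * with the infinity * zero operand combinations special-cased to return
 * the exact constant, matching the AArch32 VRECPS/VRSQRTS behaviour.
 */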
1406 
1407 #define DO_3OP(NAME, FUNC, TYPE) \
1408 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1409                   float_status *stat, uint32_t desc)                       \
1410 {                                                                          \
1411     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1412     TYPE *d = vd, *n = vn, *m = vm;                                        \
1413     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1414         d[i] = FUNC(n[i], m[i], stat);                                     \
1415     }                                                                      \
1416     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1417 }
1418 
1419 DO_3OP(gvec_fadd_h, float16_add, float16)
1420 DO_3OP(gvec_fadd_s, float32_add, float32)
1421 DO_3OP(gvec_fadd_d, float64_add, float64)
1422 
1423 DO_3OP(gvec_fsub_h, float16_sub, float16)
1424 DO_3OP(gvec_fsub_s, float32_sub, float32)
1425 DO_3OP(gvec_fsub_d, float64_sub, float64)
1426 
1427 DO_3OP(gvec_fmul_h, float16_mul, float16)
1428 DO_3OP(gvec_fmul_s, float32_mul, float32)
1429 DO_3OP(gvec_fmul_d, float64_mul, float64)
1430 
1431 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1432 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1433 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1434 
1435 DO_3OP(gvec_fabd_h, float16_abd, float16)
1436 DO_3OP(gvec_fabd_s, float32_abd, float32)
1437 DO_3OP(gvec_fabd_d, float64_abd, float64)
1438 
1439 DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
1440 DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
1441 DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)
1442 
1443 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1444 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1445 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1446 
1447 DO_3OP(gvec_fcge_h, float16_cge, float16)
1448 DO_3OP(gvec_fcge_s, float32_cge, float32)
1449 DO_3OP(gvec_fcge_d, float64_cge, float64)
1450 
1451 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1452 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1453 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1454 
1455 DO_3OP(gvec_facge_h, float16_acge, float16)
1456 DO_3OP(gvec_facge_s, float32_acge, float32)
1457 DO_3OP(gvec_facge_d, float64_acge, float64)
1458 
1459 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1460 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1461 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1462 
1463 DO_3OP(gvec_fmax_h, float16_max, float16)
1464 DO_3OP(gvec_fmax_s, float32_max, float32)
1465 DO_3OP(gvec_fmax_d, float64_max, float64)
1466 
1467 DO_3OP(gvec_fmin_h, float16_min, float16)
1468 DO_3OP(gvec_fmin_s, float32_min, float32)
1469 DO_3OP(gvec_fmin_d, float64_min, float64)
1470 
1471 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1472 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1473 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1474 
1475 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1476 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1477 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1478 
1479 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1480 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1481 
1482 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1483 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1484 
1485 #ifdef TARGET_AARCH64
1486 DO_3OP(gvec_fdiv_h, float16_div, float16)
1487 DO_3OP(gvec_fdiv_s, float32_div, float32)
1488 DO_3OP(gvec_fdiv_d, float64_div, float64)
1489 
1490 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1491 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1492 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1493 
1494 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1495 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1496 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1497 
1498 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1499 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1500 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1501 
1502 DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
1503 DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
1504 DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)
1505 
1506 DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
1507 DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
1508 DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)
1509 
1510 DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
1511 DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
1512 DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)
1513 
1514 DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
1515 DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
1516 DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)
1517 
1518 #endif
1519 #undef DO_3OP
1520 
1521 /* Non-fused multiply-add (unlike float16_muladd etc., which are fused) */
1522 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1523                                  float_status *stat)
1524 {
1525     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1526 }
1527 
1528 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1529                                  float_status *stat)
1530 {
1531     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1532 }
1533 
1534 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1535                                  float_status *stat)
1536 {
1537     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1538 }
1539 
1540 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1541                                  float_status *stat)
1542 {
1543     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1544 }
1545 
1546 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1547 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1548                                 float_status *stat)
1549 {
1550     return float16_muladd(op1, op2, dest, 0, stat);
1551 }
1552 
1553 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1554                                 float_status *stat)
1555 {
1556     return float32_muladd(op1, op2, dest, 0, stat);
1557 }
1558 
1559 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1560                                 float_status *stat)
1561 {
1562     return float64_muladd(op1, op2, dest, 0, stat);
1563 }
1564 
1565 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1566                                 float_status *stat)
1567 {
1568     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1569 }
1570 
1571 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1572                                 float_status *stat)
1573 {
1574     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1575 }
1576 
1577 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1578                                 float_status *stat)
1579 {
1580     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1581 }
1582 
1583 static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2,
1584                                    float_status *stat)
1585 {
1586     return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1587 }
1588 
1589 static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2,
1590                                    float_status *stat)
1591 {
1592     return float32_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1593 }
1594 
1595 static float64 float64_ah_mulsub_f(float64 dest, float64 op1, float64 op2,
1596                                    float_status *stat)
1597 {
1598     return float64_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1599 }
1600 
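/*
 * The two FMLS flavours above differ only in where the negation happens:
 * the default form flips the sign bit of op1 up front (float*_chs), while
 * the FPCR.AH form asks the fused multiply-add to negate the product
 * internally, so a NaN operand is passed through with its sign untouched.
 * Minimal sketch of the up-front sign flip itself (illustrative only):
 */
#if 0   /* illustrative sketch, not built */
static uint32_t f32_sign_flip_sketch(uint32_t bits)
{
    return bits ^ 0x80000000u;          /* toggles the sign even of a NaN */
}
#endif
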
1601 #define DO_MULADD(NAME, FUNC, TYPE)                                        \
1602 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1603                   float_status *stat, uint32_t desc)                       \
1604 {                                                                          \
1605     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1606     TYPE *d = vd, *n = vn, *m = vm;                                        \
1607     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1608         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1609     }                                                                      \
1610     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1611 }
1612 
1613 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1614 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1615 
1616 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1617 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1618 
1619 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1620 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1621 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1622 
1623 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1624 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1625 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1626 
1627 DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16)
1628 DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32)
1629 DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64)
1630 
1631 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1632  * For AdvSIMD, there is of course only one such vector segment.
1633  */
1634 
1635 #define DO_MUL_IDX(NAME, TYPE, H) \
1636 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1637 {                                                                          \
1638     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1639     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1640     intptr_t idx = simd_data(desc);                                        \
1641     TYPE *d = vd, *n = vn, *m = vm;                                        \
1642     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1643         TYPE mm = m[H(i + idx)];                                           \
1644         for (j = 0; j < segment; j++) {                                    \
1645             d[i + j] = n[i + j] * mm;                                      \
1646         }                                                                  \
1647     }                                                                      \
1648     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1649 }
1650 
1651 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1652 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1653 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1654 
1655 #undef DO_MUL_IDX
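
/*
 * Worked example of the per-segment indexing above (illustrative sketch,
 * ignoring the H() byte-swizzling): with 32-byte vectors of uint32_t and
 * idx == 1, m[1] scales elements 0..3 and m[5], the idx'th element of the
 * second 128-bit segment, scales elements 4..7.
 */
#if 0   /* illustrative sketch, not built */
static void mul_idx_u32_sketch(uint32_t *d, const uint32_t *n,
                               const uint32_t *m, int idx, int elems)
{
    int per_seg = 16 / sizeof(uint32_t);        /* 4 elements per 128 bits */
    for (int i = 0; i < elems; i += per_seg) {
        uint32_t mm = m[i + idx];               /* one scalar per segment */
        for (int j = 0; j < per_seg; j++) {
            d[i + j] = n[i + j] * mm;
        }
    }
}
#endif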
1656 
1657 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1658 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1659 {                                                                          \
1660     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1661     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1662     intptr_t idx = simd_data(desc);                                        \
1663     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1664     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1665         TYPE mm = m[H(i + idx)];                                           \
1666         for (j = 0; j < segment; j++) {                                    \
1667             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1668         }                                                                  \
1669     }                                                                      \
1670     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1671 }
1672 
1673 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1674 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1675 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1676 
1677 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1678 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1679 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1680 
1681 #undef DO_MLA_IDX
1682 
1683 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1684 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1685                   float_status *stat, uint32_t desc)                       \
1686 {                                                                          \
1687     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1688     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1689     intptr_t idx = simd_data(desc);                                        \
1690     TYPE *d = vd, *n = vn, *m = vm;                                        \
1691     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1692         TYPE mm = m[H(i + idx)];                                           \
1693         for (j = 0; j < segment; j++) {                                    \
1694             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1695         }                                                                  \
1696     }                                                                      \
1697     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1698 }
1699 
1700 #define nop(N, M, S) (M)
1701 
1702 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1703 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1704 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1705 
1706 #ifdef TARGET_AARCH64
1707 
1708 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1709 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1710 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1711 
1712 #endif
1713 
1714 #undef nop
1715 
1716 /*
1717  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1718  * the fused ops below, these accumulate both from and into Vd.
1719  */
1720 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1721 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1722 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1723 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1724 
1725 #undef DO_FMUL_IDX
1726 
1727 #define DO_FMLA_IDX(NAME, TYPE, H, NEGX, NEGF)                             \
1728 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1729                   float_status *stat, uint32_t desc)                       \
1730 {                                                                          \
1731     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1732     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1733     intptr_t idx = simd_data(desc);                                        \
1734     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1735     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1736         TYPE mm = m[H(i + idx)];                                           \
1737         for (j = 0; j < segment; j++) {                                    \
1738             d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm,                  \
1739                                      a[i + j], NEGF, stat);                \
1740         }                                                                  \
1741     }                                                                      \
1742     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1743 }
1744 
1745 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0)
1746 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0)
1747 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0)
1748 
1749 DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0)
1750 DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0)
1751 DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0)
1752 
1753 DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product)
1754 DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product)
1755 DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product)
1756 
1757 #undef DO_FMLA_IDX
1758 
1759 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1760 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1761 {                                                                          \
1762     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1763     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1764     bool q = false;                                                        \
1765     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1766         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1767         if (dd < MIN) {                                                    \
1768             dd = MIN;                                                      \
1769             q = true;                                                      \
1770         } else if (dd > MAX) {                                             \
1771             dd = MAX;                                                      \
1772             q = true;                                                      \
1773         }                                                                  \
1774         d[i] = dd;                                                         \
1775     }                                                                      \
1776     if (q) {                                                               \
1777         uint32_t *qc = vq;                                                 \
1778         qc[0] = 1;                                                         \
1779     }                                                                      \
1780     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1781 }
1782 
1783 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1784 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1785 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1786 
1787 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1788 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1789 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1790 
1791 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1792 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1793 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1794 
1795 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1796 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1797 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1798 
1799 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1800 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1801 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1802 
1803 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1804 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1805 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1806 
1807 #undef DO_SAT
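
/*
 * The DO_SAT expansions above do the arithmetic in a type wide enough to
 * hold any intermediate result and then clamp, setting QC on saturation;
 * e.g. for UQADD.B, 200 + 100 = 300 is computed as int, clamped to
 * UINT8_MAX = 255, and QC is set.  Standalone sketch of the same idea:
 */
#if 0   /* illustrative sketch, not built */
static uint8_t uqadd8_sketch(uint8_t a, uint8_t b, bool *qc)
{
    int sum = (int)a + b;               /* widen so the overflow is visible */
    if (sum > UINT8_MAX) {
        *qc = true;                     /* saturation occurred */
        return UINT8_MAX;
    }
    return sum;
}
#endif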
1808 
1809 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1810                           void *vm, uint32_t desc)
1811 {
1812     intptr_t i, oprsz = simd_oprsz(desc);
1813     uint64_t *d = vd, *n = vn, *m = vm;
1814     bool q = false;
1815 
1816     for (i = 0; i < oprsz / 8; i++) {
1817         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1818         if (dd < nn) {
1819             dd = UINT64_MAX;
1820             q = true;
1821         }
1822         d[i] = dd;
1823     }
1824     if (q) {
1825         uint32_t *qc = vq;
1826         qc[0] = 1;
1827     }
1828     clear_tail(d, oprsz, simd_maxsz(desc));
1829 }
1830 
1831 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1832                           void *vm, uint32_t desc)
1833 {
1834     intptr_t i, oprsz = simd_oprsz(desc);
1835     uint64_t *d = vd, *n = vn, *m = vm;
1836     bool q = false;
1837 
1838     for (i = 0; i < oprsz / 8; i++) {
1839         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1840         if (nn < mm) {
1841             dd = 0;
1842             q = true;
1843         }
1844         d[i] = dd;
1845     }
1846     if (q) {
1847         uint32_t *qc = vq;
1848         qc[0] = 1;
1849     }
1850     clear_tail(d, oprsz, simd_maxsz(desc));
1851 }
1852 
1853 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1854                           void *vm, uint32_t desc)
1855 {
1856     intptr_t i, oprsz = simd_oprsz(desc);
1857     int64_t *d = vd, *n = vn, *m = vm;
1858     bool q = false;
1859 
1860     for (i = 0; i < oprsz / 8; i++) {
1861         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1862         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1863             dd = (nn >> 63) ^ ~INT64_MIN;
1864             q = true;
1865         }
1866         d[i] = dd;
1867     }
1868     if (q) {
1869         uint32_t *qc = vq;
1870         qc[0] = 1;
1871     }
1872     clear_tail(d, oprsz, simd_maxsz(desc));
1873 }
1874 
1875 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1876                           void *vm, uint32_t desc)
1877 {
1878     intptr_t i, oprsz = simd_oprsz(desc);
1879     int64_t *d = vd, *n = vn, *m = vm;
1880     bool q = false;
1881 
1882     for (i = 0; i < oprsz / 8; i++) {
1883         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1884         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1885             dd = (nn >> 63) ^ ~INT64_MIN;
1886             q = true;
1887         }
1888         d[i] = dd;
1889     }
1890     if (q) {
1891         uint32_t *qc = vq;
1892         qc[0] = 1;
1893     }
1894     clear_tail(d, oprsz, simd_maxsz(desc));
1895 }
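
/*
 * The overflow tests above are the standard sign-bit identities: signed
 * addition overflows iff both operands have the same sign and the result's
 * sign differs, hence ((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN; subtraction
 * overflows iff the operands' signs differ and the result's sign differs
 * from nn, hence ((dd ^ nn) & (nn ^ mm)) & INT64_MIN.  The saturated value
 * (nn >> 63) ^ ~INT64_MIN is INT64_MAX for nn >= 0 and INT64_MIN otherwise.
 * Standalone sketch (two's complement assumed, illustrative only):
 */
#if 0   /* illustrative sketch, not built */
static int64_t sqadd64_sketch(int64_t a, int64_t b, bool *qc)
{
    uint64_t ua = a, ub = b, ur = ua + ub;      /* wrap-around addition */
    /* Same sign in, different sign out => overflow. */
    if ((~(ua ^ ub) & (ua ^ ur)) >> 63) {
        *qc = true;
        return a < 0 ? INT64_MIN : INT64_MAX;
    }
    return (int64_t)ur;
}
#endif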
1896 
1897 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1898                            void *vm, uint32_t desc)
1899 {
1900     intptr_t i, oprsz = simd_oprsz(desc);
1901     uint64_t *d = vd, *n = vn, *m = vm;
1902     bool q = false;
1903 
1904     for (i = 0; i < oprsz / 8; i++) {
1905         uint64_t nn = n[i];
1906         int64_t mm = m[i];
1907         uint64_t dd = nn + mm;
1908 
1909         if (mm < 0) {
1910             if (nn < (uint64_t)-mm) {
1911                 dd = 0;
1912                 q = true;
1913             }
1914         } else {
1915             if (dd < nn) {
1916                 dd = UINT64_MAX;
1917                 q = true;
1918             }
1919         }
1920         d[i] = dd;
1921     }
1922     if (q) {
1923         uint32_t *qc = vq;
1924         qc[0] = 1;
1925     }
1926     clear_tail(d, oprsz, simd_maxsz(desc));
1927 }
1928 
1929 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1930                            void *vm, uint32_t desc)
1931 {
1932     intptr_t i, oprsz = simd_oprsz(desc);
1933     uint64_t *d = vd, *n = vn, *m = vm;
1934     bool q = false;
1935 
1936     for (i = 0; i < oprsz / 8; i++) {
1937         int64_t nn = n[i];
1938         uint64_t mm = m[i];
1939         int64_t dd = nn + mm;
1940 
1941         if (mm > (uint64_t)(INT64_MAX - nn)) {
1942             dd = INT64_MAX;
1943             q = true;
1944         }
1945         d[i] = dd;
1946     }
1947     if (q) {
1948         uint32_t *qc = vq;
1949         qc[0] = 1;
1950     }
1951     clear_tail(d, oprsz, simd_maxsz(desc));
1952 }
1953 
1954 #define DO_SRA(NAME, TYPE)                              \
1955 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1956 {                                                       \
1957     intptr_t i, oprsz = simd_oprsz(desc);               \
1958     int shift = simd_data(desc);                        \
1959     TYPE *d = vd, *n = vn;                              \
1960     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1961         d[i] += n[i] >> shift;                          \
1962     }                                                   \
1963     clear_tail(d, oprsz, simd_maxsz(desc));             \
1964 }
1965 
1966 DO_SRA(gvec_ssra_b, int8_t)
1967 DO_SRA(gvec_ssra_h, int16_t)
1968 DO_SRA(gvec_ssra_s, int32_t)
1969 DO_SRA(gvec_ssra_d, int64_t)
1970 
1971 DO_SRA(gvec_usra_b, uint8_t)
1972 DO_SRA(gvec_usra_h, uint16_t)
1973 DO_SRA(gvec_usra_s, uint32_t)
1974 DO_SRA(gvec_usra_d, uint64_t)
1975 
1976 #undef DO_SRA
1977 
1978 #define DO_RSHR(NAME, TYPE)                             \
1979 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1980 {                                                       \
1981     intptr_t i, oprsz = simd_oprsz(desc);               \
1982     int shift = simd_data(desc);                        \
1983     TYPE *d = vd, *n = vn;                              \
1984     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1985         TYPE tmp = n[i] >> (shift - 1);                 \
1986         d[i] = (tmp >> 1) + (tmp & 1);                  \
1987     }                                                   \
1988     clear_tail(d, oprsz, simd_maxsz(desc));             \
1989 }
1990 
1991 DO_RSHR(gvec_srshr_b, int8_t)
1992 DO_RSHR(gvec_srshr_h, int16_t)
1993 DO_RSHR(gvec_srshr_s, int32_t)
1994 DO_RSHR(gvec_srshr_d, int64_t)
1995 
1996 DO_RSHR(gvec_urshr_b, uint8_t)
1997 DO_RSHR(gvec_urshr_h, uint16_t)
1998 DO_RSHR(gvec_urshr_s, uint32_t)
1999 DO_RSHR(gvec_urshr_d, uint64_t)
2000 
2001 #undef DO_RSHR
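
/*
 * The rounding shift above is round-half-up without an overflowing
 * addend: n >> (shift - 1) leaves the bit about to be discarded in bit 0,
 * and (tmp >> 1) + (tmp & 1) folds it back in.  E.g. n = 14, shift = 2:
 * tmp = 7, result = 3 + 1 = 4, the same as (14 + 2) >> 2.  Sketch for an
 * unsigned 32-bit element (shift in 1..32):
 */
#if 0   /* illustrative sketch, not built */
static uint32_t urshr32_sketch(uint32_t n, int shift)
{
    uint32_t tmp = n >> (shift - 1);    /* keep the rounding bit as bit 0 */
    return (tmp >> 1) + (tmp & 1);
}
#endif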
2002 
2003 #define DO_RSRA(NAME, TYPE)                             \
2004 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2005 {                                                       \
2006     intptr_t i, oprsz = simd_oprsz(desc);               \
2007     int shift = simd_data(desc);                        \
2008     TYPE *d = vd, *n = vn;                              \
2009     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2010         TYPE tmp = n[i] >> (shift - 1);                 \
2011         d[i] += (tmp >> 1) + (tmp & 1);                 \
2012     }                                                   \
2013     clear_tail(d, oprsz, simd_maxsz(desc));             \
2014 }
2015 
2016 DO_RSRA(gvec_srsra_b, int8_t)
2017 DO_RSRA(gvec_srsra_h, int16_t)
2018 DO_RSRA(gvec_srsra_s, int32_t)
2019 DO_RSRA(gvec_srsra_d, int64_t)
2020 
2021 DO_RSRA(gvec_ursra_b, uint8_t)
2022 DO_RSRA(gvec_ursra_h, uint16_t)
2023 DO_RSRA(gvec_ursra_s, uint32_t)
2024 DO_RSRA(gvec_ursra_d, uint64_t)
2025 
2026 #undef DO_RSRA
2027 
2028 #define DO_SRI(NAME, TYPE)                              \
2029 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2030 {                                                       \
2031     intptr_t i, oprsz = simd_oprsz(desc);               \
2032     int shift = simd_data(desc);                        \
2033     TYPE *d = vd, *n = vn;                              \
2034     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2035         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
2036     }                                                   \
2037     clear_tail(d, oprsz, simd_maxsz(desc));             \
2038 }
2039 
2040 DO_SRI(gvec_sri_b, uint8_t)
2041 DO_SRI(gvec_sri_h, uint16_t)
2042 DO_SRI(gvec_sri_s, uint32_t)
2043 DO_SRI(gvec_sri_d, uint64_t)
2044 
2045 #undef DO_SRI
2046 
2047 #define DO_SLI(NAME, TYPE)                              \
2048 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2049 {                                                       \
2050     intptr_t i, oprsz = simd_oprsz(desc);               \
2051     int shift = simd_data(desc);                        \
2052     TYPE *d = vd, *n = vn;                              \
2053     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2054         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
2055     }                                                   \
2056     clear_tail(d, oprsz, simd_maxsz(desc));             \
2057 }
2058 
2059 DO_SLI(gvec_sli_b, uint8_t)
2060 DO_SLI(gvec_sli_h, uint16_t)
2061 DO_SLI(gvec_sli_s, uint32_t)
2062 DO_SLI(gvec_sli_d, uint64_t)
2063 
2064 #undef DO_SLI
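
/*
 * SRI and SLI keep part of the destination: SRI with shift s replaces the
 * low (esize - s) bits of d with n >> s, leaving the top s bits of d
 * intact, while SLI replaces the high (esize - s) bits with n << s,
 * leaving the low s bits of d intact.  Equivalent mask form for 8-bit
 * elements (illustrative sketch, 1 <= shift <= 7):
 */
#if 0   /* illustrative sketch, not built */
static uint8_t sri8_sketch(uint8_t d, uint8_t n, int shift)
{
    uint8_t keep = 0xff << (8 - shift);         /* top bits kept from d */
    return (d & keep) | (n >> shift);
}
static uint8_t sli8_sketch(uint8_t d, uint8_t n, int shift)
{
    uint8_t keep = 0xff >> (8 - shift);         /* low bits kept from d */
    return (d & keep) | (uint8_t)(n << shift);
}
#endif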
2065 
2066 /*
2067  * Convert float16 to float32, raising no exceptions and
2068  * preserving exceptional values, including SNaN.
2069  * This is effectively an unpack+repack operation.
2070  */
2071 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
2072 {
2073     const int f16_bias = 15;
2074     const int f32_bias = 127;
2075     uint32_t sign = extract32(f16, 15, 1);
2076     uint32_t exp = extract32(f16, 10, 5);
2077     uint32_t frac = extract32(f16, 0, 10);
2078 
2079     if (exp == 0x1f) {
2080         /* Inf or NaN */
2081         exp = 0xff;
2082     } else if (exp == 0) {
2083         /* Zero or denormal.  */
2084         if (frac != 0) {
2085             if (fz16) {
2086                 frac = 0;
2087             } else {
2088                 /*
2089                  * Denormal; these are all normal float32.
2090                  * Shift the fraction so that the msb is at bit 11,
2091                  * then remove bit 11 as the implicit bit of the
2092                  * normalized float32.  Note that we still go through
2093                  * the shift for normal numbers below, to put the
2094                  * float32 fraction at the right place.
2095                  */
2096                 int shift = clz32(frac) - 21;
2097                 frac = (frac << shift) & 0x3ff;
2098                 exp = f32_bias - f16_bias - shift + 1;
2099             }
2100         }
2101     } else {
2102         /* Normal number; adjust the bias.  */
2103         exp += f32_bias - f16_bias;
2104     }
2105     sign <<= 31;
2106     exp <<= 23;
2107     frac <<= 23 - 10;
2108 
2109     return sign | exp | frac;
2110 }
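
/*
 * Worked example for the denormal path above: f16 0x0001 (the smallest
 * subnormal, 2^-24) has frac = 1, so shift = clz32(1) - 21 = 10; the
 * fraction is shifted up to the implicit bit and masked back to 0, and
 * exp becomes 127 - 15 - 10 + 1 = 103, which is exactly the float32
 * encoding of 2^-24.
 */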
2111 
2112 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2113 {
2114     /*
2115      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2116      * Load the 2nd qword iff is_q & is_2.
2117      * Shift to the 2nd dword iff !is_q & is_2.
2118      * For !is_q & !is_2, the upper bits of the result are garbage.
2119      */
2120     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2121 }
2122 
2123 /*
2124  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2125  * as there are not yet SVE versions that might use blocking.
2126  */
2127 
2128 static void do_fmlal(float32 *d, void *vn, void *vm,
2129                      CPUARMState *env, uint32_t desc,
2130                      ARMFPStatusFlavour fpst_idx,
2131                      uint64_t negx, int negf)
2132 {
2133     float_status *fpst = &env->vfp.fp_status[fpst_idx];
2134     bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2135     intptr_t i, oprsz = simd_oprsz(desc);
2136     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2137     int is_q = oprsz == 16;
2138     uint64_t n_4, m_4;
2139 
2140     /*
2141      * Pre-load all of the f16 data, avoiding overlap issues.
2142      * Negate all inputs for AH=0 FMLSL at once.
2143      */
2144     n_4 = load4_f16(vn, is_q, is_2) ^ negx;
2145     m_4 = load4_f16(vm, is_q, is_2);
2146 
2147     for (i = 0; i < oprsz / 4; i++) {
2148         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2149         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2150         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
2151     }
2152     clear_tail(d, oprsz, simd_maxsz(desc));
2153 }
2154 
2155 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2156                             CPUARMState *env, uint32_t desc)
2157 {
2158     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2159     uint64_t negx = is_s ? 0x8000800080008000ull : 0;
2160 
2161     do_fmlal(vd, vn, vm, env, desc, FPST_STD, negx, 0);
2162 }
2163 
2164 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2165                             CPUARMState *env, uint32_t desc)
2166 {
2167     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2168     uint64_t negx = 0;
2169     int negf = 0;
2170 
2171     if (is_s) {
2172         if (env->vfp.fpcr & FPCR_AH) {
2173             negf = float_muladd_negate_product;
2174         } else {
2175             negx = 0x8000800080008000ull;
2176         }
2177     }
2178     do_fmlal(vd, vn, vm, env, desc, FPST_A64, negx, negf);
2179 }
2180 
2181 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2182                                CPUARMState *env, uint32_t desc)
2183 {
2184     intptr_t i, oprsz = simd_oprsz(desc);
2185     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2186     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2187     float_status *status = &env->vfp.fp_status[FPST_A64];
2188     bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2189     int negx = 0, negf = 0;
2190 
2191     if (is_s) {
2192         if (env->vfp.fpcr & FPCR_AH) {
2193             negf = float_muladd_negate_product;
2194         } else {
2195             negx = 0x8000;
2196         }
2197     }
2198 
2199     for (i = 0; i < oprsz; i += sizeof(float32)) {
2200         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negx;
2201         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2202         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2203         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2204         float32 aa = *(float32 *)(va + H1_4(i));
2205 
2206         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, negf, status);
2207     }
2208 }
2209 
2210 static void do_fmlal_idx(float32 *d, void *vn, void *vm,
2211                          CPUARMState *env, uint32_t desc,
2212                          ARMFPStatusFlavour fpst_idx,
2213                          uint64_t negx, int negf)
2214 {
2215     float_status *fpst = &env->vfp.fp_status[fpst_idx];
2216     bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2217     intptr_t i, oprsz = simd_oprsz(desc);
2218     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2219     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2220     int is_q = oprsz == 16;
2221     uint64_t n_4;
2222     float32 m_1;
2223 
2224     /*
2225      * Pre-load all of the f16 data, avoiding overlap issues.
2226      * Negate all inputs for AH=0 FMLSL at once.
2227      */
2228     n_4 = load4_f16(vn, is_q, is_2) ^ negx;
2229     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2230 
2231     for (i = 0; i < oprsz / 4; i++) {
2232         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2233         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
2234     }
2235     clear_tail(d, oprsz, simd_maxsz(desc));
2236 }
2237 
2238 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2239                                 CPUARMState *env, uint32_t desc)
2240 {
2241     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2242     uint64_t negx = is_s ? 0x8000800080008000ull : 0;
2243 
2244     do_fmlal_idx(vd, vn, vm, env, desc, FPST_STD, negx, 0);
2245 }
2246 
2247 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2248                                 CPUARMState *env, uint32_t desc)
2249 {
2250     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2251     uint64_t negx = 0;
2252     int negf = 0;
2253 
2254     if (is_s) {
2255         if (env->vfp.fpcr & FPCR_AH) {
2256             negf = float_muladd_negate_product;
2257         } else {
2258             negx = 0x8000800080008000ull;
2259         }
2260     }
2261     do_fmlal_idx(vd, vn, vm, env, desc, FPST_A64, negx, negf);
2262 }
2263 
2264 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2265                                CPUARMState *env, uint32_t desc)
2266 {
2267     intptr_t i, j, oprsz = simd_oprsz(desc);
2268     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2269     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2270     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2271     float_status *status = &env->vfp.fp_status[FPST_A64];
2272     bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2273     int negx = 0, negf = 0;
2274 
2275     if (is_s) {
2276         if (env->vfp.fpcr & FPCR_AH) {
2277             negf = float_muladd_negate_product;
2278         } else {
2279             negx = 0x8000;
2280         }
2281     }
2282     for (i = 0; i < oprsz; i += 16) {
2283         float16 mm_16 = *(float16 *)(vm + i + idx);
2284         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2285 
2286         for (j = 0; j < 16; j += sizeof(float32)) {
2287             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negx;
2288             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2289             float32 aa = *(float32 *)(va + H1_4(i + j));
2290 
2291             *(float32 *)(vd + H1_4(i + j)) =
2292                 float32_muladd(nn, mm, aa, negf, status);
2293         }
2294     }
2295 }
2296 
2297 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2298 {
2299     intptr_t i, opr_sz = simd_oprsz(desc);
2300     int8_t *d = vd, *n = vn, *m = vm;
2301 
2302     for (i = 0; i < opr_sz; ++i) {
2303         int8_t mm = m[i];
2304         int8_t nn = n[i];
2305         int8_t res = 0;
2306         if (mm >= 0) {
2307             if (mm < 8) {
2308                 res = nn << mm;
2309             }
2310         } else {
2311             res = nn >> (mm > -8 ? -mm : 7);
2312         }
2313         d[i] = res;
2314     }
2315     clear_tail(d, opr_sz, simd_maxsz(desc));
2316 }
2317 
2318 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2319 {
2320     intptr_t i, opr_sz = simd_oprsz(desc);
2321     int16_t *d = vd, *n = vn, *m = vm;
2322 
2323     for (i = 0; i < opr_sz / 2; ++i) {
2324         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2325         int16_t nn = n[i];
2326         int16_t res = 0;
2327         if (mm >= 0) {
2328             if (mm < 16) {
2329                 res = nn << mm;
2330             }
2331         } else {
2332             res = nn >> (mm > -16 ? -mm : 15);
2333         }
2334         d[i] = res;
2335     }
2336     clear_tail(d, opr_sz, simd_maxsz(desc));
2337 }
2338 
2339 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2340 {
2341     intptr_t i, opr_sz = simd_oprsz(desc);
2342     uint8_t *d = vd, *n = vn, *m = vm;
2343 
2344     for (i = 0; i < opr_sz; ++i) {
2345         int8_t mm = m[i];
2346         uint8_t nn = n[i];
2347         uint8_t res = 0;
2348         if (mm >= 0) {
2349             if (mm < 8) {
2350                 res = nn << mm;
2351             }
2352         } else {
2353             if (mm > -8) {
2354                 res = nn >> -mm;
2355             }
2356         }
2357         d[i] = res;
2358     }
2359     clear_tail(d, opr_sz, simd_maxsz(desc));
2360 }
2361 
2362 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2363 {
2364     intptr_t i, opr_sz = simd_oprsz(desc);
2365     uint16_t *d = vd, *n = vn, *m = vm;
2366 
2367     for (i = 0; i < opr_sz / 2; ++i) {
2368         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2369         uint16_t nn = n[i];
2370         uint16_t res = 0;
2371         if (mm >= 0) {
2372             if (mm < 16) {
2373                 res = nn << mm;
2374             }
2375         } else {
2376             if (mm > -16) {
2377                 res = nn >> -mm;
2378             }
2379         }
2380         d[i] = res;
2381     }
2382     clear_tail(d, opr_sz, simd_maxsz(desc));
2383 }
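
/*
 * For the shift-by-register helpers above only the low 8 bits of each
 * shift element are significant: a positive count shifts left (counts of
 * esize or more give 0), a negative count shifts right, with signed
 * right shifts clamped to esize-1 so the result is the sign fill, and
 * unsigned right shifts of esize or more giving 0.  E.g. for the 16-bit
 * signed case, nn = 0x8000, mm = -20 gives 0x8000 >> 15 = 0xffff.
 */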
2384 
2385 /*
2386  * 8x8->8 polynomial multiply.
2387  *
2388  * Polynomial multiplication is like integer multiplication except the
2389  * partial products are XORed, not added.
2390  *
2391  * TODO: expose this as a generic vector operation, as it is a common
2392  * crypto building block.
2393  */
2394 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2395 {
2396     intptr_t i, opr_sz = simd_oprsz(desc);
2397     uint64_t *d = vd, *n = vn, *m = vm;
2398 
2399     for (i = 0; i < opr_sz / 8; ++i) {
2400         d[i] = clmul_8x8_low(n[i], m[i]);
2401     }
2402     clear_tail(d, opr_sz, simd_maxsz(desc));
2403 }
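
/*
 * Reference form of a single 8x8->8 lane of the carry-less multiply
 * above; clmul_8x8_low applies this to all eight byte lanes of a
 * uint64_t at once.  Illustrative sketch only:
 */
#if 0   /* illustrative sketch, not built */
static uint8_t pmul8_sketch(uint8_t a, uint8_t b)
{
    uint8_t r = 0;
    for (int i = 0; i < 8; i++) {
        if (b & (1u << i)) {
            r ^= a << i;                /* XOR the partial product: no carries */
        }
    }
    return r;
}
#endif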
2404 
2405 /*
2406  * 64x64->128 polynomial multiply.
2407  * Because the lanes are not accessed in strict columns,
2408  * this probably cannot be turned into a generic helper.
2409  */
2410 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2411 {
2412     intptr_t i, opr_sz = simd_oprsz(desc);
2413     intptr_t hi = simd_data(desc);
2414     uint64_t *d = vd, *n = vn, *m = vm;
2415 
2416     for (i = 0; i < opr_sz / 8; i += 2) {
2417         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2418         d[i] = int128_getlo(r);
2419         d[i + 1] = int128_gethi(r);
2420     }
2421     clear_tail(d, opr_sz, simd_maxsz(desc));
2422 }
2423 
2424 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2425 {
2426     int hi = simd_data(desc);
2427     uint64_t *d = vd, *n = vn, *m = vm;
2428     uint64_t nn = n[hi], mm = m[hi];
2429 
2430     d[0] = clmul_8x4_packed(nn, mm);
2431     nn >>= 32;
2432     mm >>= 32;
2433     d[1] = clmul_8x4_packed(nn, mm);
2434 
2435     clear_tail(d, 16, simd_maxsz(desc));
2436 }
2437 
2438 #ifdef TARGET_AARCH64
2439 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2440 {
2441     int shift = simd_data(desc) * 8;
2442     intptr_t i, opr_sz = simd_oprsz(desc);
2443     uint64_t *d = vd, *n = vn, *m = vm;
2444 
2445     for (i = 0; i < opr_sz / 8; ++i) {
2446         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2447     }
2448 }
2449 
2450 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2451 {
2452     intptr_t sel = H4(simd_data(desc));
2453     intptr_t i, opr_sz = simd_oprsz(desc);
2454     uint32_t *n = vn, *m = vm;
2455     uint64_t *d = vd;
2456 
2457     for (i = 0; i < opr_sz / 8; ++i) {
2458         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2459     }
2460 }
2461 #endif
2462 
2463 #define DO_CMP0(NAME, TYPE, OP)                         \
2464 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2465 {                                                       \
2466     intptr_t i, opr_sz = simd_oprsz(desc);              \
2467     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2468         TYPE nn = *(TYPE *)(vn + i);                    \
2469         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2470     }                                                   \
2471     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2472 }
2473 
2474 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2475 DO_CMP0(gvec_clt0_b, int8_t, <)
2476 DO_CMP0(gvec_cle0_b, int8_t, <=)
2477 DO_CMP0(gvec_cgt0_b, int8_t, >)
2478 DO_CMP0(gvec_cge0_b, int8_t, >=)
2479 
2480 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2481 DO_CMP0(gvec_clt0_h, int16_t, <)
2482 DO_CMP0(gvec_cle0_h, int16_t, <=)
2483 DO_CMP0(gvec_cgt0_h, int16_t, >)
2484 DO_CMP0(gvec_cge0_h, int16_t, >=)
2485 
2486 #undef DO_CMP0
2487 
2488 #define DO_ABD(NAME, TYPE)                                      \
2489 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2490 {                                                               \
2491     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2492     TYPE *d = vd, *n = vn, *m = vm;                             \
2493                                                                 \
2494     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2495         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2496     }                                                           \
2497     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2498 }
2499 
2500 DO_ABD(gvec_sabd_b, int8_t)
2501 DO_ABD(gvec_sabd_h, int16_t)
2502 DO_ABD(gvec_sabd_s, int32_t)
2503 DO_ABD(gvec_sabd_d, int64_t)
2504 
2505 DO_ABD(gvec_uabd_b, uint8_t)
2506 DO_ABD(gvec_uabd_h, uint16_t)
2507 DO_ABD(gvec_uabd_s, uint32_t)
2508 DO_ABD(gvec_uabd_d, uint64_t)
2509 
2510 #undef DO_ABD
2511 
2512 #define DO_ABA(NAME, TYPE)                                      \
2513 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2514 {                                                               \
2515     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2516     TYPE *d = vd, *n = vn, *m = vm;                             \
2517                                                                 \
2518     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2519         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2520     }                                                           \
2521     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2522 }
2523 
2524 DO_ABA(gvec_saba_b, int8_t)
2525 DO_ABA(gvec_saba_h, int16_t)
2526 DO_ABA(gvec_saba_s, int32_t)
2527 DO_ABA(gvec_saba_d, int64_t)
2528 
2529 DO_ABA(gvec_uaba_b, uint8_t)
2530 DO_ABA(gvec_uaba_h, uint16_t)
2531 DO_ABA(gvec_uaba_s, uint32_t)
2532 DO_ABA(gvec_uaba_d, uint64_t)
2533 
2534 #undef DO_ABA
2535 
2536 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2537 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
2538                   float_status *stat, uint32_t desc)                       \
2539 {                                                                          \
2540     ARMVectorReg scratch;                                                  \
2541     intptr_t oprsz = simd_oprsz(desc);                                     \
2542     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2543     TYPE *d = vd, *n = vn, *m = vm;                                        \
2544     if (unlikely(d == m)) {                                                \
2545         m = memcpy(&scratch, m, oprsz);                                    \
2546     }                                                                      \
2547     for (intptr_t i = 0; i < half; ++i) {                                  \
2548         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2549     }                                                                      \
2550     for (intptr_t i = 0; i < half; ++i) {                                  \
2551         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2552     }                                                                      \
2553     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2554 }
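
/*
 * Pairwise layout note: the low half of the result is reduced from Vn and
 * the high half from Vm, so Vd may safely alias Vn (each n[i] is read no
 * later than the iteration that overwrites it) but not Vm, hence the
 * scratch copy when d == m.
 */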
2555 
2556 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2557 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2558 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2559 
2560 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2561 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2562 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2563 
2564 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2565 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2566 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2567 
2568 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2569 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2570 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2571 
2572 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2573 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2574 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2575 
2576 #ifdef TARGET_AARCH64
2577 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
2578 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
2579 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, )
2580 
2581 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
2582 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
2583 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, )
2584 #endif
2585 
2586 #undef DO_3OP_PAIR
2587 
2588 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2589 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2590 {                                                               \
2591     ARMVectorReg scratch;                                       \
2592     intptr_t oprsz = simd_oprsz(desc);                          \
2593     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2594     TYPE *d = vd, *n = vn, *m = vm;                             \
2595     if (unlikely(d == m)) {                                     \
2596         m = memcpy(&scratch, m, oprsz);                         \
2597     }                                                           \
2598     for (intptr_t i = 0; i < half; ++i) {                       \
2599         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2600     }                                                           \
2601     for (intptr_t i = 0; i < half; ++i) {                       \
2602         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2603     }                                                           \
2604     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2605 }
2606 
2607 #define ADD(A, B) (A + B)
2608 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2609 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2610 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2611 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2612 #undef  ADD
2613 
2614 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2615 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2616 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2617 
2618 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2619 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2620 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2621 
2622 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2623 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2624 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2625 
2626 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2627 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2628 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2629 
2630 #undef DO_3OP_PAIR
2631 
2632 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2633     void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
2634     {                                                                   \
2635         intptr_t i, oprsz = simd_oprsz(desc);                           \
2636         int shift = simd_data(desc);                                    \
2637         TYPE *d = vd, *n = vn;                                          \
2638         float_status *fpst = stat;                                      \
2639         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2640             d[i] = FUNC(n[i], shift, fpst);                             \
2641         }                                                               \
2642         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2643     }
2644 
2645 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2646 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2647 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2648 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2649 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2650 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2651 
2652 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2653 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2654 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2655 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2656 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2657 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2658 
2659 #undef DO_VCVT_FIXED
2660 
2661 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2662     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2663     {                                                                   \
2664         intptr_t i, oprsz = simd_oprsz(desc);                           \
2665         uint32_t rmode = simd_data(desc);                               \
2666         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2667         TYPE *d = vd, *n = vn;                                          \
2668         set_float_rounding_mode(rmode, fpst);                           \
2669         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2670             d[i] = FUNC(n[i], 0, fpst);                                 \
2671         }                                                               \
2672         set_float_rounding_mode(prev_rmode, fpst);                      \
2673         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2674     }
2675 
2676 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
2677 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
2678 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2679 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2680 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2681 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2682 
2683 #undef DO_VCVT_RMODE
2684 
2685 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2686     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2687     {                                                                   \
2688         intptr_t i, oprsz = simd_oprsz(desc);                           \
2689         uint32_t rmode = simd_data(desc);                               \
2690         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2691         TYPE *d = vd, *n = vn;                                          \
2692         set_float_rounding_mode(rmode, fpst);                           \
2693         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2694             d[i] = FUNC(n[i], fpst);                                    \
2695         }                                                               \
2696         set_float_rounding_mode(prev_rmode, fpst);                      \
2697         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2698     }
2699 
2700 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2701 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2702 
2703 #undef DO_VRINT_RMODE
2704 
2705 #ifdef TARGET_AARCH64
2706 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
2707 {
2708     const uint8_t *indices = vm;
2709     size_t oprsz = simd_oprsz(desc);
2710     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2711     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2712     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2713     union {
2714         uint8_t b[16];
2715         uint64_t d[2];
2716     } result;
2717 
2718     /*
2719      * We must construct the final result in a temp, lest the output
2720      * overlap the input table.  For TBL, begin with zero; for TBX,
2721      * begin with the original register contents.  Note that we always
2722      * copy 16 bytes here to avoid an extra branch; clearing the high
2723      * bits of the register for oprsz == 8 is handled below.
2724      */
2725     if (is_tbx) {
2726         memcpy(&result, vd, 16);
2727     } else {
2728         memset(&result, 0, 16);
2729     }
2730 
2731     for (size_t i = 0; i < oprsz; ++i) {
2732         uint32_t index = indices[H1(i)];
2733 
2734         if (index < table_len) {
2735             /*
2736              * Convert index (a byte offset into the virtual table
2737              * which is a series of 128-bit vectors concatenated)
2738              * into the correct register element, bearing in mind
2739              * that the table can wrap around from V31 to V0.
2740              */
2741             const uint8_t *table = (const uint8_t *)
2742                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2743             result.b[H1(i)] = table[H1(index % 16)];
2744         }
2745     }
2746 
2747     memcpy(vd, &result, 16);
2748     clear_tail(vd, oprsz, simd_maxsz(desc));
2749 }
2750 #endif
2751 
2752 /*
2753  * NxN -> N highpart multiply
2754  *
2755  * TODO: expose this as a generic vector operation.
2756  */
2757 
2758 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2759 {
2760     intptr_t i, opr_sz = simd_oprsz(desc);
2761     int8_t *d = vd, *n = vn, *m = vm;
2762 
2763     for (i = 0; i < opr_sz; ++i) {
2764         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2765     }
2766     clear_tail(d, opr_sz, simd_maxsz(desc));
2767 }
2768 
2769 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2770 {
2771     intptr_t i, opr_sz = simd_oprsz(desc);
2772     int16_t *d = vd, *n = vn, *m = vm;
2773 
2774     for (i = 0; i < opr_sz / 2; ++i) {
2775         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2776     }
2777     clear_tail(d, opr_sz, simd_maxsz(desc));
2778 }
2779 
2780 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2781 {
2782     intptr_t i, opr_sz = simd_oprsz(desc);
2783     int32_t *d = vd, *n = vn, *m = vm;
2784 
2785     for (i = 0; i < opr_sz / 4; ++i) {
2786         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2787     }
2788     clear_tail(d, opr_sz, simd_maxsz(desc));
2789 }
2790 
2791 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2792 {
2793     intptr_t i, opr_sz = simd_oprsz(desc);
2794     uint64_t *d = vd, *n = vn, *m = vm;
2795     uint64_t discard;
2796 
2797     for (i = 0; i < opr_sz / 8; ++i) {
2798         muls64(&discard, &d[i], n[i], m[i]);
2799     }
2800     clear_tail(d, opr_sz, simd_maxsz(desc));
2801 }
2802 
2803 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2804 {
2805     intptr_t i, opr_sz = simd_oprsz(desc);
2806     uint8_t *d = vd, *n = vn, *m = vm;
2807 
2808     for (i = 0; i < opr_sz; ++i) {
2809         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2810     }
2811     clear_tail(d, opr_sz, simd_maxsz(desc));
2812 }
2813 
2814 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2815 {
2816     intptr_t i, opr_sz = simd_oprsz(desc);
2817     uint16_t *d = vd, *n = vn, *m = vm;
2818 
2819     for (i = 0; i < opr_sz / 2; ++i) {
2820         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2821     }
2822     clear_tail(d, opr_sz, simd_maxsz(desc));
2823 }
2824 
2825 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2826 {
2827     intptr_t i, opr_sz = simd_oprsz(desc);
2828     uint32_t *d = vd, *n = vn, *m = vm;
2829 
2830     for (i = 0; i < opr_sz / 4; ++i) {
2831         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2832     }
2833     clear_tail(d, opr_sz, simd_maxsz(desc));
2834 }
2835 
2836 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2837 {
2838     intptr_t i, opr_sz = simd_oprsz(desc);
2839     uint64_t *d = vd, *n = vn, *m = vm;
2840     uint64_t discard;
2841 
2842     for (i = 0; i < opr_sz / 8; ++i) {
2843         mulu64(&discard, &d[i], n[i], m[i]);
2844     }
2845     clear_tail(d, opr_sz, simd_maxsz(desc));
2846 }
2847 
2848 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2849 {
2850     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2851     int shr = simd_data(desc);
2852     uint64_t *d = vd, *n = vn, *m = vm;
2853 
2854     for (i = 0; i < opr_sz; ++i) {
2855         d[i] = ror64(n[i] ^ m[i], shr);
2856     }
2857     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2858 }
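
/*
 * Illustrative note (not part of the helper above): XAR is an
 * exclusive-OR followed by a rotate right, so with shr == 8 each byte
 * of (n ^ m) moves one byte position lower, the least significant
 * byte wrapping around to the top.
 */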
2859 
2860 /*
2861  * Integer matrix-multiply accumulate
2862  */
2863 
2864 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2865 {
2866     int8_t *n = vn, *m = vm;
2867 
2868     for (intptr_t k = 0; k < 8; ++k) {
2869         sum += n[H1(k)] * m[H1(k)];
2870     }
2871     return sum;
2872 }
2873 
2874 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2875 {
2876     uint8_t *n = vn, *m = vm;
2877 
2878     for (intptr_t k = 0; k < 8; ++k) {
2879         sum += n[H1(k)] * m[H1(k)];
2880     }
2881     return sum;
2882 }
2883 
2884 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2885 {
2886     uint8_t *n = vn;
2887     int8_t *m = vm;
2888 
2889     for (intptr_t k = 0; k < 8; ++k) {
2890         sum += n[H1(k)] * m[H1(k)];
2891     }
2892     return sum;
2893 }
2894 
2895 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2896                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2897 {
2898     intptr_t seg, opr_sz = simd_oprsz(desc);
2899 
2900     for (seg = 0; seg < opr_sz; seg += 16) {
2901         uint32_t *d = vd + seg;
2902         uint32_t *a = va + seg;
2903         uint32_t sum0, sum1, sum2, sum3;
2904 
2905         /*
2906          * Process the entire segment at once, writing back the
2907          * results only after we've consumed all of the inputs.
2908          *
2909          * Key to indices by column:
2910          *          i   j                  i             j
2911          */
2912         sum0 = a[H4(0 + 0)];
2913         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2914         sum1 = a[H4(0 + 1)];
2915         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2916         sum2 = a[H4(2 + 0)];
2917         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2918         sum3 = a[H4(2 + 1)];
2919         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2920 
2921         d[H4(0)] = sum0;
2922         d[H4(1)] = sum1;
2923         d[H4(2)] = sum2;
2924         d[H4(3)] = sum3;
2925     }
2926     clear_tail(vd, opr_sz, simd_maxsz(desc));
2927 }
2928 
2929 #define DO_MMLA_B(NAME, INNER) \
2930     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2931     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2932 
2933 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2934 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2935 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
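
/*
 * Illustrative sketch (not part of the helpers above): within each
 * 16-byte segment, SMMLA/UMMLA/USMMLA treat n and m as 2x8 byte
 * matrices and d/a as a 2x2 matrix of 32-bit accumulators, roughly
 *
 *     for (i = 0; i < 2; i++) {
 *         for (j = 0; j < 2; j++) {
 *             d[i][j] = a[i][j];
 *             for (k = 0; k < 8; k++) {
 *                 d[i][j] += n[i][k] * m[j][k];
 *             }
 *         }
 *     }
 *
 * which is what the four inner_loop() calls in do_mmla_b() unroll.
 */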
2936 
2937 /*
2938  * BFloat16 Dot Product
2939  */
2940 
2941 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2942 {
2943     /*
2944      * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2945      * For EBF = 0, we ignore the FPCR bits which determine rounding
2946      * mode and denormal-flushing, and we do unfused multiplies and
2947      * additions with intermediate rounding of all products and sums.
2948      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2949      * and we perform a fused two-way sum-of-products without intermediate
2950      * rounding of the products.
2951      * In either case, we don't set fp exception flags.
2952      *
2953      * EBF is AArch64 only, so even if it's set in the FPCR it has
2954      * no effect on AArch32 instructions.
2955      */
2956     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2957 
2958     *statusp = env->vfp.fp_status[is_a64(env) ? FPST_A64 : FPST_A32];
2959     set_default_nan_mode(true, statusp);
2960 
2961     if (ebf) {
2962         /* EBF=1 needs to do a step with round-to-odd semantics */
2963         *oddstatusp = *statusp;
2964         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2965     } else {
2966         set_flush_to_zero(true, statusp);
2967         set_flush_inputs_to_zero(true, statusp);
2968         set_float_rounding_mode(float_round_to_odd_inf, statusp);
2969     }
2970     return ebf;
2971 }
2972 
2973 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2974 {
2975     float32 t1, t2;
2976 
2977     /*
2978      * Extract each BFloat16 from the element pair, and shift
2979      * them such that they become float32.
2980      */
2981     t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2982     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2983     t1 = float32_add(t1, t2, fpst);
2984     t1 = float32_add(sum, t1, fpst);
2985 
2986     return t1;
2987 }
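
/*
 * Illustrative example (not part of the helper above): a bfloat16 is
 * the top half of a float32, so shifting it left by 16 reinterprets it
 * exactly; bfloat16 0x3f80 (1.0) becomes float32 0x3f800000 (1.0), and
 * masking with 0xffff0000 leaves the high element of the pair already
 * in float32 position.
 */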
2988 
2989 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
2990                      float_status *fpst, float_status *fpst_odd)
2991 {
2992     /*
2993      * Compare f16_dotadd() in sme_helper.c, but here we have
2994      * bfloat16 inputs. In particular that means that we do not
2995      * want the FPCR.FZ16 flush semantics, so we use the normal
2996      * float_status for the input handling here.
2997      */
2998     float64 e1r = float32_to_float64(e1 << 16, fpst);
2999     float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
3000     float64 e2r = float32_to_float64(e2 << 16, fpst);
3001     float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
3002     float64 t64;
3003     float32 t32;
3004 
3005     /*
3006      * The ARM pseudocode function FPDot performs both multiplies
3007      * and the add with a single rounding operation.  Emulate this
3008      * by performing the first multiply in round-to-odd, then doing
3009      * the second multiply as fused multiply-add, and rounding to
3010      * float32 all in one step.
3011      */
3012     t64 = float64_mul(e1r, e2r, fpst_odd);
3013     t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
3014 
3015     /* This conversion is exact, because we've already rounded. */
3016     t32 = float64_to_float32(t64, fpst);
3017 
3018     /* The final accumulation step is not fused. */
3019     return float32_add(sum, t32, fpst);
3020 }
3021 
3022 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
3023                         CPUARMState *env, uint32_t desc)
3024 {
3025     intptr_t i, opr_sz = simd_oprsz(desc);
3026     float32 *d = vd, *a = va;
3027     uint32_t *n = vn, *m = vm;
3028     float_status fpst, fpst_odd;
3029 
3030     if (is_ebf(env, &fpst, &fpst_odd)) {
3031         for (i = 0; i < opr_sz / 4; ++i) {
3032             d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
3033         }
3034     } else {
3035         for (i = 0; i < opr_sz / 4; ++i) {
3036             d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
3037         }
3038     }
3039     clear_tail(d, opr_sz, simd_maxsz(desc));
3040 }
3041 
3042 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
3043                             void *va, CPUARMState *env, uint32_t desc)
3044 {
3045     intptr_t i, j, opr_sz = simd_oprsz(desc);
3046     intptr_t index = simd_data(desc);
3047     intptr_t elements = opr_sz / 4;
3048     intptr_t eltspersegment = MIN(16 / 4, elements);
3049     float32 *d = vd, *a = va;
3050     uint32_t *n = vn, *m = vm;
3051     float_status fpst, fpst_odd;
3052 
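    /*
     * The index selects one 32-bit element (one bfloat16 pair) within
     * each 128-bit segment of m; every element of the corresponding
     * segment of n is dotted against that same pair.
     */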
3053     if (is_ebf(env, &fpst, &fpst_odd)) {
3054         for (i = 0; i < elements; i += eltspersegment) {
3055             uint32_t m_idx = m[i + H4(index)];
3056 
3057             for (j = i; j < i + eltspersegment; j++) {
3058                 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
3059             }
3060         }
3061     } else {
3062         for (i = 0; i < elements; i += eltspersegment) {
3063             uint32_t m_idx = m[i + H4(index)];
3064 
3065             for (j = i; j < i + eltspersegment; j++) {
3066                 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
3067             }
3068         }
3069     }
3070     clear_tail(d, opr_sz, simd_maxsz(desc));
3071 }
3072 
3073 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
3074                          CPUARMState *env, uint32_t desc)
3075 {
3076     intptr_t s, opr_sz = simd_oprsz(desc);
3077     float32 *d = vd, *a = va;
3078     uint32_t *n = vn, *m = vm;
3079     float_status fpst, fpst_odd;
3080 
3081     if (is_ebf(env, &fpst, &fpst_odd)) {
3082         for (s = 0; s < opr_sz / 4; s += 4) {
3083             float32 sum00, sum01, sum10, sum11;
3084 
3085             /*
3086              * Process the entire segment at once, writing back the
3087              * results only after we've consumed all of the inputs.
3088              *
3089              * Key to indices by column:
3090              *               i   j               i   k             j   k
3091              */
3092             sum00 = a[s + H4(0 + 0)];
3093             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3094             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3095 
3096             sum01 = a[s + H4(0 + 1)];
3097             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3098             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3099 
3100             sum10 = a[s + H4(2 + 0)];
3101             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3102             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3103 
3104             sum11 = a[s + H4(2 + 1)];
3105             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3106             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3107 
3108             d[s + H4(0 + 0)] = sum00;
3109             d[s + H4(0 + 1)] = sum01;
3110             d[s + H4(2 + 0)] = sum10;
3111             d[s + H4(2 + 1)] = sum11;
3112         }
3113     } else {
3114         for (s = 0; s < opr_sz / 4; s += 4) {
3115             float32 sum00, sum01, sum10, sum11;
3116 
3117             /*
3118              * Process the entire segment at once, writing back the
3119              * results only after we've consumed all of the inputs.
3120              *
3121              * Key to indices by column:
3122              *               i   j           i   k             j   k
3123              */
3124             sum00 = a[s + H4(0 + 0)];
3125             sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
3126             sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
3127 
3128             sum01 = a[s + H4(0 + 1)];
3129             sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
3130             sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
3131 
3132             sum10 = a[s + H4(2 + 0)];
3133             sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
3134             sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
3135 
3136             sum11 = a[s + H4(2 + 1)];
3137             sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
3138             sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
3139 
3140             d[s + H4(0 + 0)] = sum00;
3141             d[s + H4(0 + 1)] = sum01;
3142             d[s + H4(2 + 0)] = sum10;
3143             d[s + H4(2 + 1)] = sum11;
3144         }
3145     }
3146     clear_tail(d, opr_sz, simd_maxsz(desc));
3147 }
3148 
3149 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
3150                          float_status *stat, uint32_t desc)
3151 {
3152     intptr_t i, opr_sz = simd_oprsz(desc);
3153     intptr_t sel = simd_data(desc);
3154     float32 *d = vd, *a = va;
3155     bfloat16 *n = vn, *m = vm;
3156 
3157     for (i = 0; i < opr_sz / 4; ++i) {
3158         float32 nn = n[H2(i * 2 + sel)] << 16;
3159         float32 mm = m[H2(i * 2 + sel)] << 16;
3160         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
3161     }
3162     clear_tail(d, opr_sz, simd_maxsz(desc));
3163 }
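
/*
 * Illustrative note (not part of the helper above): 'sel' picks the
 * even-numbered (sel == 0, the B form) or odd-numbered (sel == 1, the
 * T form) bfloat16 element of each 32-bit pair, widened to float32 by
 * the same << 16 trick used in bfdotadd() above.
 */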
3164 
3165 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
3166                              void *va, float_status *stat, uint32_t desc)
3167 {
3168     intptr_t i, j, opr_sz = simd_oprsz(desc);
3169     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3170     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3171     intptr_t elements = opr_sz / 4;
3172     intptr_t eltspersegment = MIN(16 / 4, elements);
3173     float32 *d = vd, *a = va;
3174     bfloat16 *n = vn, *m = vm;
3175 
3176     for (i = 0; i < elements; i += eltspersegment) {
3177         float32 m_idx = m[H2(2 * i + index)] << 16;
3178 
3179         for (j = i; j < i + eltspersegment; j++) {
3180             float32 n_j = n[H2(2 * j + sel)] << 16;
3181             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
3182         }
3183     }
3184     clear_tail(d, opr_sz, simd_maxsz(desc));
3185 }
3186 
3187 #define DO_CLAMP(NAME, TYPE) \
3188 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
3189 {                                                                       \
3190     intptr_t i, opr_sz = simd_oprsz(desc);                              \
3191     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
3192         TYPE aa = *(TYPE *)(a + i);                                     \
3193         TYPE nn = *(TYPE *)(n + i);                                     \
3194         TYPE mm = *(TYPE *)(m + i);                                     \
3195         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
3196         *(TYPE *)(d + i) = dd;                                          \
3197     }                                                                   \
3198     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
3199 }
3200 
3201 DO_CLAMP(gvec_sclamp_b, int8_t)
3202 DO_CLAMP(gvec_sclamp_h, int16_t)
3203 DO_CLAMP(gvec_sclamp_s, int32_t)
3204 DO_CLAMP(gvec_sclamp_d, int64_t)
3205 
3206 DO_CLAMP(gvec_uclamp_b, uint8_t)
3207 DO_CLAMP(gvec_uclamp_h, uint16_t)
3208 DO_CLAMP(gvec_uclamp_s, uint32_t)
3209 DO_CLAMP(gvec_uclamp_d, uint64_t)
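
/*
 * Illustrative example (not part of the helpers above): the clamp is
 * MIN(MAX(a, n), m), i.e. 'a' forced into the range [n, m] (given
 * n <= m), so for gvec_sclamp_b with n = 0 and m = 10, a = -5 yields 0
 * and a = 42 yields 10.
 */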
3210 
3211 /* Bit count in each 8-bit word. */
3212 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
3213 {
3214     intptr_t i, opr_sz = simd_oprsz(desc);
3215     uint8_t *d = vd, *n = vn;
3216 
3217     for (i = 0; i < opr_sz; ++i) {
3218         d[i] = ctpop8(n[i]);
3219     }
3220     clear_tail(d, opr_sz, simd_maxsz(desc));
3221 }
3222 
3223 /* Reverse bits in each 8-bit word. */
3224 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
3225 {
3226     intptr_t i, opr_sz = simd_oprsz(desc);
3227     uint64_t *d = vd, *n = vn;
3228 
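    /*
     * revbit64() reverses all 64 bits; applying it to the byte-swapped
     * value returns each byte to its original position with only the
     * bits within that byte reversed.
     */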
3229     for (i = 0; i < opr_sz / 8; ++i) {
3230         d[i] = revbit64(bswap64(n[i]));
3231     }
3232     clear_tail(d, opr_sz, simd_maxsz(desc));
3233 }
3234 
3235 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
3236 {
3237     intptr_t i, opr_sz = simd_oprsz(desc);
3238     uint32_t *d = vd, *n = vn;
3239 
3240     for (i = 0; i < opr_sz / 4; ++i) {
3241         d[i] = helper_recpe_u32(n[i]);
3242     }
3243     clear_tail(d, opr_sz, simd_maxsz(desc));
3244 }
3245 
3246 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
3247 {
3248     intptr_t i, opr_sz = simd_oprsz(desc);
3249     uint32_t *d = vd, *n = vn;
3250 
3251     for (i = 0; i < opr_sz / 4; ++i) {
3252         d[i] = helper_rsqrte_u32(n[i]);
3253     }
3254     clear_tail(d, opr_sz, simd_maxsz(desc));
3255 }
3256