xref: /qemu/target/arm/tcg/vec_helper.c (revision a66c4585fff70ffc4a61e0f5f5528320a55cd9cd)
1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28 
29 /*
30  * Data for expanding active predicate bits to bytes, for byte elements.
31  *
32  *  for (i = 0; i < 256; ++i) {
33  *      unsigned long m = 0;
34  *      for (j = 0; j < 8; j++) {
35  *          if ((i >> j) & 1) {
36  *              m |= 0xfful << (j << 3);
37  *          }
38  *      }
39  *      printf("0x%016lx,\n", m);
40  *  }
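 *
 *  For example, i == 0x05 (elements 0 and 2 active) produces
 *  expand_pred_b_data[0x05] == 0x0000000000ff00ff.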
41  */
42 const uint64_t expand_pred_b_data[256] = {
43     0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44     0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45     0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46     0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47     0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48     0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49     0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50     0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51     0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52     0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53     0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54     0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55     0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56     0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57     0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58     0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59     0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60     0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61     0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62     0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63     0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64     0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65     0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66     0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67     0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68     0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69     0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70     0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71     0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72     0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73     0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74     0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75     0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76     0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77     0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78     0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79     0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80     0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81     0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82     0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83     0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84     0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85     0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86     0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87     0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88     0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89     0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90     0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91     0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92     0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93     0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94     0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95     0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96     0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97     0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98     0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99     0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100     0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101     0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102     0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103     0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104     0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105     0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106     0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107     0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108     0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109     0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110     0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111     0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112     0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113     0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114     0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115     0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116     0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117     0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118     0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119     0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120     0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121     0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122     0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123     0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124     0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125     0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126     0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127     0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128     0xffffffffffffffff,
129 };
130 
131 /*
132  * Similarly for half-word elements.
133  *  for (i = 0; i < 256; ++i) {
134  *      unsigned long m = 0;
135  *      if (i & 0xaa) {
136  *          continue;
137  *      }
138  *      for (j = 0; j < 8; j += 2) {
139  *          if ((i >> j) & 1) {
140  *              m |= 0xfffful << (j << 3);
141  *          }
142  *      }
143  *      printf("[0x%x] = 0x%016lx,\n", i, m);
144  *  }
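 *
 *  Only indices with set bits in even positions are populated, since the
 *  governing predicate bit for halfword element j is bit 2*j.  For example,
 *  expand_pred_h_data[0x14] covers halfwords 1 and 2: 0x0000ffffffff0000.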
145  */
146 const uint64_t expand_pred_h_data[0x55 + 1] = {
147     [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148     [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149     [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150     [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151     [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152     [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153     [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154     [0x55] = 0xffffffffffffffff,
155 };
156 
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159                      bool neg, bool round)
160 {
161     /*
162      * Simplify:
163      * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164      * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
165      */
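    /*
     * Worked example (neg == false, round == true): src1 = src2 = INT8_MIN
     * gives ret = 16384; (16384 + 64) >> 7 == 128, which does not fit in
     * int8_t, so the result saturates to INT8_MAX.
     */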
166     int32_t ret = (int32_t)src1 * src2;
167     if (neg) {
168         ret = -ret;
169     }
170     ret += ((int32_t)src3 << 7) + (round << 6);
171     ret >>= 7;
172 
173     if (ret != (int8_t)ret) {
174         ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175     }
176     return ret;
177 }
178 
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180                              void *va, uint32_t desc)
181 {
182     intptr_t i, opr_sz = simd_oprsz(desc);
183     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 
185     for (i = 0; i < opr_sz; ++i) {
186         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187     }
188 }
189 
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191                              void *va, uint32_t desc)
192 {
193     intptr_t i, opr_sz = simd_oprsz(desc);
194     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 
196     for (i = 0; i < opr_sz; ++i) {
197         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198     }
199 }
200 
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203     intptr_t i, opr_sz = simd_oprsz(desc);
204     int8_t *d = vd, *n = vn, *m = vm;
205 
206     for (i = 0; i < opr_sz; ++i) {
207         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208     }
209 }
210 
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213     intptr_t i, opr_sz = simd_oprsz(desc);
214     int8_t *d = vd, *n = vn, *m = vm;
215 
216     for (i = 0; i < opr_sz; ++i) {
217         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218     }
219 }
220 
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223                       bool neg, bool round, uint32_t *sat)
224 {
225     /* Simplify similarly to do_sqrdmlah_b above.  */
226     int32_t ret = (int32_t)src1 * src2;
227     if (neg) {
228         ret = -ret;
229     }
230     ret += ((int32_t)src3 << 15) + (round << 14);
231     ret >>= 15;
232 
233     if (ret != (int16_t)ret) {
234         *sat = 1;
235         ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236     }
237     return ret;
238 }
239 
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241                                   uint32_t src2, uint32_t src3)
242 {
243     uint32_t *sat = &env->vfp.qc[0];
244     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246                                 false, true, sat);
247     return deposit32(e1, 16, 16, e2);
248 }
249 
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251                               void *vq, uint32_t desc)
252 {
253     uintptr_t opr_sz = simd_oprsz(desc);
254     int16_t *d = vd;
255     int16_t *n = vn;
256     int16_t *m = vm;
257     uintptr_t i;
258 
259     for (i = 0; i < opr_sz / 2; ++i) {
260         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261     }
262     clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264 
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266                                   uint32_t src2, uint32_t src3)
267 {
268     uint32_t *sat = &env->vfp.qc[0];
269     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271                                 true, true, sat);
272     return deposit32(e1, 16, 16, e2);
273 }
274 
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276                               void *vq, uint32_t desc)
277 {
278     uintptr_t opr_sz = simd_oprsz(desc);
279     int16_t *d = vd;
280     int16_t *n = vn;
281     int16_t *m = vm;
282     uintptr_t i;
283 
284     for (i = 0; i < opr_sz / 2; ++i) {
285         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286     }
287     clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289 
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291                             void *vq, uint32_t desc)
292 {
293     intptr_t i, opr_sz = simd_oprsz(desc);
294     int16_t *d = vd, *n = vn, *m = vm;
295 
296     for (i = 0; i < opr_sz / 2; ++i) {
297         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298     }
299     clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301 
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303                              void *vq, uint32_t desc)
304 {
305     intptr_t i, opr_sz = simd_oprsz(desc);
306     int16_t *d = vd, *n = vn, *m = vm;
307 
308     for (i = 0; i < opr_sz / 2; ++i) {
309         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310     }
311     clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313 
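/*
 * For the *_idx_* helpers below, the vector is processed in 128-bit
 * segments: within each segment, every element is multiplied by the one
 * element of that segment of 'vm' selected by 'idx'.  eltspersegment is
 * clamped so that the 64-bit AdvSIMD case (opr_sz == 8) only touches
 * half a segment.
 */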
314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315                                 void *vq, uint32_t desc)
316 {
317     intptr_t i, j, opr_sz = simd_oprsz(desc);
318     int idx = simd_data(desc);
319     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320     intptr_t elements = opr_sz / 2;
321     intptr_t eltspersegment = MIN(16 / 2, elements);
322 
323     for (i = 0; i < elements; i += 16 / 2) {
324         int16_t mm = m[i];
325         for (j = 0; j < eltspersegment; ++j) {
326             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
327         }
328     }
329     clear_tail(d, opr_sz, simd_maxsz(desc));
330 }
331 
332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
333                                  void *vq, uint32_t desc)
334 {
335     intptr_t i, j, opr_sz = simd_oprsz(desc);
336     int idx = simd_data(desc);
337     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338     intptr_t elements = opr_sz / 2;
339     intptr_t eltspersegment = MIN(16 / 2, elements);
340 
341     for (i = 0; i < elements; i += 16 / 2) {
342         int16_t mm = m[i];
343         for (j = 0; j < eltspersegment; ++j) {
344             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
345         }
346     }
347     clear_tail(d, opr_sz, simd_maxsz(desc));
348 }
349 
350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
351                                  void *vq, uint32_t desc)
352 {
353     intptr_t i, j, opr_sz = simd_oprsz(desc);
354     int idx = simd_data(desc);
355     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
356     intptr_t elements = opr_sz / 2;
357     intptr_t eltspersegment = MIN(16 / 2, elements);
358 
359     for (i = 0; i < elements; i += 16 / 2) {
360         int16_t mm = m[i];
361         for (j = 0; j < eltspersegment; ++j) {
362             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
363         }
364     }
365     clear_tail(d, opr_sz, simd_maxsz(desc));
366 }
367 
368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
369                                  void *vq, uint32_t desc)
370 {
371     intptr_t i, j, opr_sz = simd_oprsz(desc);
372     int idx = simd_data(desc);
373     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
374     intptr_t elements = opr_sz / 2;
375     intptr_t eltspersegment = MIN(16 / 2, elements);
376 
377     for (i = 0; i < elements; i += 16 / 2) {
378         int16_t mm = m[i];
379         for (j = 0; j < eltspersegment; ++j) {
380             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
381         }
382     }
383     clear_tail(d, opr_sz, simd_maxsz(desc));
384 }
385 
386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
387                              void *va, uint32_t desc)
388 {
389     intptr_t i, opr_sz = simd_oprsz(desc);
390     int16_t *d = vd, *n = vn, *m = vm, *a = va;
391     uint32_t discard;
392 
393     for (i = 0; i < opr_sz / 2; ++i) {
394         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
395     }
396 }
397 
398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
399                              void *va, uint32_t desc)
400 {
401     intptr_t i, opr_sz = simd_oprsz(desc);
402     int16_t *d = vd, *n = vn, *m = vm, *a = va;
403     uint32_t discard;
404 
405     for (i = 0; i < opr_sz / 2; ++i) {
406         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
407     }
408 }
409 
410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
411 {
412     intptr_t i, opr_sz = simd_oprsz(desc);
413     int16_t *d = vd, *n = vn, *m = vm;
414     uint32_t discard;
415 
416     for (i = 0; i < opr_sz / 2; ++i) {
417         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
418     }
419 }
420 
421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
422 {
423     intptr_t i, opr_sz = simd_oprsz(desc);
424     int16_t *d = vd, *n = vn, *m = vm;
425     uint32_t discard;
426 
427     for (i = 0; i < opr_sz / 2; ++i) {
428         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
429     }
430 }
431 
432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
433 {
434     intptr_t i, j, opr_sz = simd_oprsz(desc);
435     int idx = simd_data(desc);
436     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
437     uint32_t discard;
438 
439     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
440         int16_t mm = m[i];
441         for (j = 0; j < 16 / 2; ++j) {
442             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
443         }
444     }
445 }
446 
447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
448 {
449     intptr_t i, j, opr_sz = simd_oprsz(desc);
450     int idx = simd_data(desc);
451     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
452     uint32_t discard;
453 
454     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
455         int16_t mm = m[i];
456         for (j = 0; j < 16 / 2; ++j) {
457             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
458         }
459     }
460 }
461 
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
464                       bool neg, bool round, uint32_t *sat)
465 {
466     /* Simplify similarly to do_sqrdmlah_b above.  */
467     int64_t ret = (int64_t)src1 * src2;
468     if (neg) {
469         ret = -ret;
470     }
471     ret += ((int64_t)src3 << 31) + (round << 30);
472     ret >>= 31;
473 
474     if (ret != (int32_t)ret) {
475         *sat = 1;
476         ret = (ret < 0 ? INT32_MIN : INT32_MAX);
477     }
478     return ret;
479 }
480 
481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
482                                   int32_t src2, int32_t src3)
483 {
484     uint32_t *sat = &env->vfp.qc[0];
485     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
486 }
487 
488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
489                               void *vq, uint32_t desc)
490 {
491     uintptr_t opr_sz = simd_oprsz(desc);
492     int32_t *d = vd;
493     int32_t *n = vn;
494     int32_t *m = vm;
495     uintptr_t i;
496 
497     for (i = 0; i < opr_sz / 4; ++i) {
498         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
499     }
500     clear_tail(d, opr_sz, simd_maxsz(desc));
501 }
502 
503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
504                                   int32_t src2, int32_t src3)
505 {
506     uint32_t *sat = &env->vfp.qc[0];
507     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
508 }
509 
510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
511                               void *vq, uint32_t desc)
512 {
513     uintptr_t opr_sz = simd_oprsz(desc);
514     int32_t *d = vd;
515     int32_t *n = vn;
516     int32_t *m = vm;
517     uintptr_t i;
518 
519     for (i = 0; i < opr_sz / 4; ++i) {
520         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
521     }
522     clear_tail(d, opr_sz, simd_maxsz(desc));
523 }
524 
525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
526                             void *vq, uint32_t desc)
527 {
528     intptr_t i, opr_sz = simd_oprsz(desc);
529     int32_t *d = vd, *n = vn, *m = vm;
530 
531     for (i = 0; i < opr_sz / 4; ++i) {
532         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
533     }
534     clear_tail(d, opr_sz, simd_maxsz(desc));
535 }
536 
537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
538                              void *vq, uint32_t desc)
539 {
540     intptr_t i, opr_sz = simd_oprsz(desc);
541     int32_t *d = vd, *n = vn, *m = vm;
542 
543     for (i = 0; i < opr_sz / 4; ++i) {
544         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
545     }
546     clear_tail(d, opr_sz, simd_maxsz(desc));
547 }
548 
549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
550                                 void *vq, uint32_t desc)
551 {
552     intptr_t i, j, opr_sz = simd_oprsz(desc);
553     int idx = simd_data(desc);
554     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
555     intptr_t elements = opr_sz / 4;
556     intptr_t eltspersegment = MIN(16 / 4, elements);
557 
558     for (i = 0; i < elements; i += 16 / 4) {
559         int32_t mm = m[i];
560         for (j = 0; j < eltspersegment; ++j) {
561             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
562         }
563     }
564     clear_tail(d, opr_sz, simd_maxsz(desc));
565 }
566 
567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
568                                  void *vq, uint32_t desc)
569 {
570     intptr_t i, j, opr_sz = simd_oprsz(desc);
571     int idx = simd_data(desc);
572     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
573     intptr_t elements = opr_sz / 4;
574     intptr_t eltspersegment = MIN(16 / 4, elements);
575 
576     for (i = 0; i < elements; i += 16 / 4) {
577         int32_t mm = m[i];
578         for (j = 0; j < eltspersegment; ++j) {
579             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
580         }
581     }
582     clear_tail(d, opr_sz, simd_maxsz(desc));
583 }
584 
585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
586                                  void *vq, uint32_t desc)
587 {
588     intptr_t i, j, opr_sz = simd_oprsz(desc);
589     int idx = simd_data(desc);
590     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
591     intptr_t elements = opr_sz / 4;
592     intptr_t eltspersegment = MIN(16 / 4, elements);
593 
594     for (i = 0; i < elements; i += 16 / 4) {
595         int32_t mm = m[i];
596         for (j = 0; j < eltspersegment; ++j) {
597             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
598         }
599     }
600     clear_tail(d, opr_sz, simd_maxsz(desc));
601 }
602 
603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
604                                  void *vq, uint32_t desc)
605 {
606     intptr_t i, j, opr_sz = simd_oprsz(desc);
607     int idx = simd_data(desc);
608     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
609     intptr_t elements = opr_sz / 4;
610     intptr_t eltspersegment = MIN(16 / 4, elements);
611 
612     for (i = 0; i < elements; i += 16 / 4) {
613         int32_t mm = m[i];
614         for (j = 0; j < eltspersegment; ++j) {
615             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
616         }
617     }
618     clear_tail(d, opr_sz, simd_maxsz(desc));
619 }
620 
621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
622                              void *va, uint32_t desc)
623 {
624     intptr_t i, opr_sz = simd_oprsz(desc);
625     int32_t *d = vd, *n = vn, *m = vm, *a = va;
626     uint32_t discard;
627 
628     for (i = 0; i < opr_sz / 4; ++i) {
629         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
630     }
631 }
632 
633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
634                              void *va, uint32_t desc)
635 {
636     intptr_t i, opr_sz = simd_oprsz(desc);
637     int32_t *d = vd, *n = vn, *m = vm, *a = va;
638     uint32_t discard;
639 
640     for (i = 0; i < opr_sz / 4; ++i) {
641         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
642     }
643 }
644 
645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
646 {
647     intptr_t i, opr_sz = simd_oprsz(desc);
648     int32_t *d = vd, *n = vn, *m = vm;
649     uint32_t discard;
650 
651     for (i = 0; i < opr_sz / 4; ++i) {
652         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
653     }
654 }
655 
656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
657 {
658     intptr_t i, opr_sz = simd_oprsz(desc);
659     int32_t *d = vd, *n = vn, *m = vm;
660     uint32_t discard;
661 
662     for (i = 0; i < opr_sz / 4; ++i) {
663         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
664     }
665 }
666 
667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
668 {
669     intptr_t i, j, opr_sz = simd_oprsz(desc);
670     int idx = simd_data(desc);
671     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
672     uint32_t discard;
673 
674     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
675         int32_t mm = m[i];
676         for (j = 0; j < 16 / 4; ++j) {
677             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
678         }
679     }
680 }
681 
682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
683 {
684     intptr_t i, j, opr_sz = simd_oprsz(desc);
685     int idx = simd_data(desc);
686     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
687     uint32_t discard;
688 
689     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
690         int32_t mm = m[i];
691         for (j = 0; j < 16 / 4; ++j) {
692             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
693         }
694     }
695 }
696 
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
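/*
 * Saturate a 128-bit value to int64_t: the value fits iff the high half
 * is the sign extension of the low half.
 */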
698 static int64_t do_sat128_d(Int128 r)
699 {
700     int64_t ls = int128_getlo(r);
701     int64_t hs = int128_gethi(r);
702 
703     if (unlikely(hs != (ls >> 63))) {
704         return hs < 0 ? INT64_MIN : INT64_MAX;
705     }
706     return ls;
707 }
708 
709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710 {
711     uint64_t l, h;
712     Int128 r, t;
713 
714     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
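    /*
     * The product is formed in 128 bits; as in the 8-bit case, the
     * accumulator is added at bit 63 and the rounding constant at bit 62,
     * then the result is shifted right arithmetically by 63 and saturated
     * to 64 bits.
     */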
715     muls64(&l, &h, m, n);
716     r = int128_make128(l, h);
717     if (neg) {
718         r = int128_neg(r);
719     }
720     if (a) {
721         t = int128_exts64(a);
722         t = int128_lshift(t, 63);
723         r = int128_add(r, t);
724     }
725     if (round) {
726         t = int128_exts64(1ll << 62);
727         r = int128_add(r, t);
728     }
729     r = int128_rshift(r, 63);
730 
731     return do_sat128_d(r);
732 }
733 
734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735                              void *va, uint32_t desc)
736 {
737     intptr_t i, opr_sz = simd_oprsz(desc);
738     int64_t *d = vd, *n = vn, *m = vm, *a = va;
739 
740     for (i = 0; i < opr_sz / 8; ++i) {
741         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742     }
743 }
744 
745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746                              void *va, uint32_t desc)
747 {
748     intptr_t i, opr_sz = simd_oprsz(desc);
749     int64_t *d = vd, *n = vn, *m = vm, *a = va;
750 
751     for (i = 0; i < opr_sz / 8; ++i) {
752         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753     }
754 }
755 
756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757 {
758     intptr_t i, opr_sz = simd_oprsz(desc);
759     int64_t *d = vd, *n = vn, *m = vm;
760 
761     for (i = 0; i < opr_sz / 8; ++i) {
762         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763     }
764 }
765 
766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767 {
768     intptr_t i, opr_sz = simd_oprsz(desc);
769     int64_t *d = vd, *n = vn, *m = vm;
770 
771     for (i = 0; i < opr_sz / 8; ++i) {
772         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773     }
774 }
775 
776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777 {
778     intptr_t i, j, opr_sz = simd_oprsz(desc);
779     int idx = simd_data(desc);
780     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781 
782     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783         int64_t mm = m[i];
784         for (j = 0; j < 16 / 8; ++j) {
785             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786         }
787     }
788 }
789 
790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791 {
792     intptr_t i, j, opr_sz = simd_oprsz(desc);
793     int idx = simd_data(desc);
794     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795 
796     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797         int64_t mm = m[i];
798         for (j = 0; j < 16 / 8; ++j) {
799             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800         }
801     }
802 }
803 
804 /* Integer 8 and 16-bit dot-product.
805  *
806  * Note that for the loops herein, host endianness does not matter
807  * with respect to the ordering of data within the quad-width lanes.
808  * All elements are treated equally, no matter where they are.
809  */
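/*
 * Each quad-width destination lane accumulates four products, e.g. for
 * gvec_sdot_b:
 *     d[i] = a[i] + n[4*i + 0] * m[4*i + 0] + ... + n[4*i + 3] * m[4*i + 3]
 * where the (TYPED) cast widens the products to the destination width.
 */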
810 
811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
813 {                                                                         \
814     intptr_t i, opr_sz = simd_oprsz(desc);                                \
815     TYPED *d = vd, *a = va;                                               \
816     TYPEN *n = vn;                                                        \
817     TYPEM *m = vm;                                                        \
818     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
819         d[i] = (a[i] +                                                    \
820                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
821                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
822                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
823                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
824     }                                                                     \
825     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
826 }
827 
828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
833 
834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
836 {                                                                         \
837     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
838     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
839     /*                                                                    \
840      * Special case: opr_sz == 8 from AA64/AA32 AdvSIMD means the         \
841      * first iteration might not be a full 16-byte segment. But           \
842      * for vector lengths beyond that this must be SVE and we know        \
843      * opr_sz is a multiple of 16, so we need not clamp segend            \
844      * to opr_sz_n when we advance it at the end of the loop.             \
845      */                                                                   \
846     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
847     intptr_t index = simd_data(desc);                                     \
848     TYPED *d = vd, *a = va;                                               \
849     TYPEN *n = vn;                                                        \
850     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
851     do {                                                                  \
852         TYPED m0 = m_indexed[i * 4 + 0];                                  \
853         TYPED m1 = m_indexed[i * 4 + 1];                                  \
854         TYPED m2 = m_indexed[i * 4 + 2];                                  \
855         TYPED m3 = m_indexed[i * 4 + 3];                                  \
856         do {                                                              \
857             d[i] = (a[i] +                                                \
858                     n[i * 4 + 0] * m0 +                                   \
859                     n[i * 4 + 1] * m1 +                                   \
860                     n[i * 4 + 2] * m2 +                                   \
861                     n[i * 4 + 3] * m3);                                   \
862         } while (++i < segend);                                           \
863         segend = i + (16 / sizeof(TYPED));                                \
864     } while (i < opr_sz_n);                                               \
865     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
866 }
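/*
 * For the indexed dot products, 'index' selects one group of four TYPEM
 * elements within each 16-byte segment of 'vm'; that group (m0..m3) is
 * reused for every destination element of the segment.
 */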
867 
868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
874 
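/*
 * Floating-point complex add.  Treating each pair of elements as a
 * complex value (re, im), the helpers below compute d = n + i * m when
 * the rotate bit is clear (the #90 form) and d = n - i * m when it is
 * set (the #270 form).  When FPCR.AH is set, the float*_maybe_ah_chs
 * helpers are used so that NaN operands do not have their sign flipped.
 */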
875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
876                          float_status *fpst, uint32_t desc)
877 {
878     uintptr_t opr_sz = simd_oprsz(desc);
879     float16 *d = vd;
880     float16 *n = vn;
881     float16 *m = vm;
882     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
883     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
884     uintptr_t i;
885 
886     for (i = 0; i < opr_sz / 2; i += 2) {
887         float16 e0 = n[H2(i)];
888         float16 e1 = m[H2(i + 1)];
889         float16 e2 = n[H2(i + 1)];
890         float16 e3 = m[H2(i)];
891 
892         if (rot) {
893             e3 = float16_maybe_ah_chs(e3, fpcr_ah);
894         } else {
895             e1 = float16_maybe_ah_chs(e1, fpcr_ah);
896         }
897 
898         d[H2(i)] = float16_add(e0, e1, fpst);
899         d[H2(i + 1)] = float16_add(e2, e3, fpst);
900     }
901     clear_tail(d, opr_sz, simd_maxsz(desc));
902 }
903 
904 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
905                          float_status *fpst, uint32_t desc)
906 {
907     uintptr_t opr_sz = simd_oprsz(desc);
908     float32 *d = vd;
909     float32 *n = vn;
910     float32 *m = vm;
911     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
912     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
913     uintptr_t i;
914 
915     for (i = 0; i < opr_sz / 4; i += 2) {
916         float32 e0 = n[H4(i)];
917         float32 e1 = m[H4(i + 1)];
918         float32 e2 = n[H4(i + 1)];
919         float32 e3 = m[H4(i)];
920 
921         if (rot) {
922             e3 = float32_maybe_ah_chs(e3, fpcr_ah);
923         } else {
924             e1 = float32_maybe_ah_chs(e1, fpcr_ah);
925         }
926 
927         d[H4(i)] = float32_add(e0, e1, fpst);
928         d[H4(i + 1)] = float32_add(e2, e3, fpst);
929     }
930     clear_tail(d, opr_sz, simd_maxsz(desc));
931 }
932 
933 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
934                          float_status *fpst, uint32_t desc)
935 {
936     uintptr_t opr_sz = simd_oprsz(desc);
937     float64 *d = vd;
938     float64 *n = vn;
939     float64 *m = vm;
940     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
941     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
942     uintptr_t i;
943 
944     for (i = 0; i < opr_sz / 8; i += 2) {
945         float64 e0 = n[i];
946         float64 e1 = m[i + 1];
947         float64 e2 = n[i + 1];
948         float64 e3 = m[i];
949 
950         if (rot) {
951             e3 = float64_maybe_ah_chs(e3, fpcr_ah);
952         } else {
953             e1 = float64_maybe_ah_chs(e1, fpcr_ah);
954         }
955 
956         d[i] = float64_add(e0, e1, fpst);
957         d[i + 1] = float64_add(e2, e3, fpst);
958     }
959     clear_tail(d, opr_sz, simd_maxsz(desc));
960 }
961 
962 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
963                          float_status *fpst, uint32_t desc)
964 {
965     uintptr_t opr_sz = simd_oprsz(desc);
966     float16 *d = vd, *n = vn, *m = vm, *a = va;
967     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
968     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
969     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
970     uint32_t negf_real = flip ^ negf_imag;
971     float16 negx_imag, negx_real;
972     uintptr_t i;
973 
974     /* With AH=0, use negx; with AH=1 use negf. */
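    /*
     * negx_* are XOR masks that flip the sign bit of the 'm' operand
     * before the multiply, whereas negf_* ask float*_muladd to negate
     * the product instead, which leaves the sign of a propagated NaN
     * unchanged.  The same scheme is used by the remaining fcmla
     * helpers below.
     */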
975     negx_real = (negf_real & ~fpcr_ah) << 15;
976     negx_imag = (negf_imag & ~fpcr_ah) << 15;
977     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
978     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
979 
980     for (i = 0; i < opr_sz / 2; i += 2) {
981         float16 e2 = n[H2(i + flip)];
982         float16 e1 = m[H2(i + flip)] ^ negx_real;
983         float16 e4 = e2;
984         float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag;
985 
986         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], negf_real, fpst);
987         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], negf_imag, fpst);
988     }
989     clear_tail(d, opr_sz, simd_maxsz(desc));
990 }
991 
992 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
993                              float_status *fpst, uint32_t desc)
994 {
995     uintptr_t opr_sz = simd_oprsz(desc);
996     float16 *d = vd, *n = vn, *m = vm, *a = va;
997     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
998     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
999     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1000     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
1001     uint32_t negf_real = flip ^ negf_imag;
1002     intptr_t elements = opr_sz / sizeof(float16);
1003     intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
1004     float16 negx_imag, negx_real;
1005     intptr_t i, j;
1006 
1007     /* With AH=0, use negx; with AH=1 use negf. */
1008     negx_real = (negf_real & ~fpcr_ah) << 15;
1009     negx_imag = (negf_imag & ~fpcr_ah) << 15;
1010     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1011     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1012 
1013     for (i = 0; i < elements; i += eltspersegment) {
1014         float16 mr = m[H2(i + 2 * index + 0)];
1015         float16 mi = m[H2(i + 2 * index + 1)];
1016         float16 e1 = negx_real ^ (flip ? mi : mr);
1017         float16 e3 = negx_imag ^ (flip ? mr : mi);
1018 
1019         for (j = i; j < i + eltspersegment; j += 2) {
1020             float16 e2 = n[H2(j + flip)];
1021             float16 e4 = e2;
1022 
1023             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], negf_real, fpst);
1024             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], negf_imag, fpst);
1025         }
1026     }
1027     clear_tail(d, opr_sz, simd_maxsz(desc));
1028 }
1029 
1030 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
1031                          float_status *fpst, uint32_t desc)
1032 {
1033     uintptr_t opr_sz = simd_oprsz(desc);
1034     float32 *d = vd, *n = vn, *m = vm, *a = va;
1035     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1036     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
1037     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1038     uint32_t negf_real = flip ^ negf_imag;
1039     float32 negx_imag, negx_real;
1040     uintptr_t i;
1041 
1042     /* With AH=0, use negx; with AH=1 use negf. */
1043     negx_real = (negf_real & ~fpcr_ah) << 31;
1044     negx_imag = (negf_imag & ~fpcr_ah) << 31;
1045     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1046     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1047 
1048     for (i = 0; i < opr_sz / 4; i += 2) {
1049         float32 e2 = n[H4(i + flip)];
1050         float32 e1 = m[H4(i + flip)] ^ negx_real;
1051         float32 e4 = e2;
1052         float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag;
1053 
1054         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], negf_real, fpst);
1055         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], negf_imag, fpst);
1056     }
1057     clear_tail(d, opr_sz, simd_maxsz(desc));
1058 }
1059 
1060 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
1061                              float_status *fpst, uint32_t desc)
1062 {
1063     uintptr_t opr_sz = simd_oprsz(desc);
1064     float32 *d = vd, *n = vn, *m = vm, *a = va;
1065     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1066     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1067     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1068     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
1069     uint32_t negf_real = flip ^ negf_imag;
1070     intptr_t elements = opr_sz / sizeof(float32);
1071     intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
1072     float32 negx_imag, negx_real;
1073     intptr_t i, j;
1074 
1075     /* With AH=0, use negx; with AH=1 use negf. */
1076     negx_real = (negf_real & ~fpcr_ah) << 31;
1077     negx_imag = (negf_imag & ~fpcr_ah) << 31;
1078     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1079     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1080 
1081     for (i = 0; i < elements; i += eltspersegment) {
1082         float32 mr = m[H4(i + 2 * index + 0)];
1083         float32 mi = m[H4(i + 2 * index + 1)];
1084         float32 e1 = negx_real ^ (flip ? mi : mr);
1085         float32 e3 = negx_imag ^ (flip ? mr : mi);
1086 
1087         for (j = i; j < i + eltspersegment; j += 2) {
1088             float32 e2 = n[H4(j + flip)];
1089             float32 e4 = e2;
1090 
1091             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], negf_real, fpst);
1092             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], negf_imag, fpst);
1093         }
1094     }
1095     clear_tail(d, opr_sz, simd_maxsz(desc));
1096 }
1097 
1098 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
1099                          float_status *fpst, uint32_t desc)
1100 {
1101     uintptr_t opr_sz = simd_oprsz(desc);
1102     float64 *d = vd, *n = vn, *m = vm, *a = va;
1103     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1104     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
1105     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1106     uint32_t negf_real = flip ^ negf_imag;
1107     float64 negx_real, negx_imag;
1108     uintptr_t i;
1109 
1110     /* With AH=0, use negx; with AH=1 use negf. */
1111     negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
1112     negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
1113     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1114     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1115 
1116     for (i = 0; i < opr_sz / 8; i += 2) {
1117         float64 e2 = n[i + flip];
1118         float64 e1 = m[i + flip] ^ negx_real;
1119         float64 e4 = e2;
1120         float64 e3 = m[i + 1 - flip] ^ negx_imag;
1121 
1122         d[i] = float64_muladd(e2, e1, a[i], negf_real, fpst);
1123         d[i + 1] = float64_muladd(e4, e3, a[i + 1], negf_imag, fpst);
1124     }
1125     clear_tail(d, opr_sz, simd_maxsz(desc));
1126 }
1127 
1128 /*
1129  * Floating point comparisons producing an integer result (all 1s or all 0s).
1130  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1131  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
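 * This is also why EQ uses the quiet compare while GE and GT use le/lt,
 * which do signal on quiet NaNs.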
1132  */
1133 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1134 {
1135     return -float16_eq_quiet(op1, op2, stat);
1136 }
1137 
1138 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1139 {
1140     return -float32_eq_quiet(op1, op2, stat);
1141 }
1142 
1143 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1144 {
1145     return -float64_eq_quiet(op1, op2, stat);
1146 }
1147 
1148 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1149 {
1150     return -float16_le(op2, op1, stat);
1151 }
1152 
1153 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1154 {
1155     return -float32_le(op2, op1, stat);
1156 }
1157 
1158 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1159 {
1160     return -float64_le(op2, op1, stat);
1161 }
1162 
1163 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1164 {
1165     return -float16_lt(op2, op1, stat);
1166 }
1167 
1168 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1169 {
1170     return -float32_lt(op2, op1, stat);
1171 }
1172 
1173 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1174 {
1175     return -float64_lt(op2, op1, stat);
1176 }
1177 
1178 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1179 {
1180     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1181 }
1182 
1183 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1184 {
1185     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1186 }
1187 
1188 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1189 {
1190     return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1191 }
1192 
1193 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1194 {
1195     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1196 }
1197 
1198 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1199 {
1200     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1201 }
1202 
1203 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1204 {
1205     return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1206 }
1207 
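/*
 * Float16 to 16-bit integer conversions, round to zero.  Arm requires a
 * NaN input to raise Invalid and convert to zero, which the bare softfloat
 * conversions do not provide, hence the explicit NaN check.
 */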
1208 static int16_t vfp_tosszh(float16 x, float_status *fpst)
1209 {
1210     if (float16_is_any_nan(x)) {
1211         float_raise(float_flag_invalid, fpst);
1212         return 0;
1213     }
1214     return float16_to_int16_round_to_zero(x, fpst);
1215 }
1216 
1217 static uint16_t vfp_touszh(float16 x, float_status *fpst)
1218 {
1219     if (float16_is_any_nan(x)) {
1220         float_raise(float_flag_invalid, fpst);
1221         return 0;
1222     }
1223     return float16_to_uint16_round_to_zero(x, fpst);
1224 }
1225 
1226 #define DO_2OP(NAME, FUNC, TYPE) \
1227 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc)  \
1228 {                                                                 \
1229     intptr_t i, oprsz = simd_oprsz(desc);                         \
1230     TYPE *d = vd, *n = vn;                                        \
1231     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1232         d[i] = FUNC(n[i], stat);                                  \
1233     }                                                             \
1234     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1235 }
1236 
1237 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1238 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1239 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1240 
1241 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1242 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1243 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1244 
1245 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1246 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1247 
1248 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1249 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1250 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1251 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1252 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1253 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1254 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1255 DO_2OP(gvec_touszh, vfp_touszh, float16)
1256 
1257 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1258     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1259     {                                                           \
1260         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1261     }
1262 
1263 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1264     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1265     {                                                           \
1266         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1267     }
1268 
1269 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1270     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1271     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1272     WRAP_CMP0_##DIRN(FN, CMPOP, float64)                \
1273     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1274     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)   \
1275     DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)
1276 
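/*
 * The REV forms place zero as the first operand, so e.g.
 * DO_2OP_CMP0(clt, cgt, REV) defines float*_clt0(op) as
 * float*_cgt(zero, op), i.e. a test for op < 0.
 */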
1277 DO_2OP_CMP0(cgt, cgt, FWD)
1278 DO_2OP_CMP0(cge, cge, FWD)
1279 DO_2OP_CMP0(ceq, ceq, FWD)
1280 DO_2OP_CMP0(clt, cgt, REV)
1281 DO_2OP_CMP0(cle, cge, REV)
1282 
1283 #undef DO_2OP
1284 #undef DO_2OP_CMP0
1285 
1286 /* Floating-point trigonometric starting value.
1287  * See the ARM ARM pseudocode function FPTrigSMul.
1288  */
1289 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1290 {
1291     float16 result = float16_mul(op1, op1, stat);
1292     if (!float16_is_any_nan(result)) {
1293         result = float16_set_sign(result, op2 & 1);
1294     }
1295     return result;
1296 }
1297 
1298 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1299 {
1300     float32 result = float32_mul(op1, op1, stat);
1301     if (!float32_is_any_nan(result)) {
1302         result = float32_set_sign(result, op2 & 1);
1303     }
1304     return result;
1305 }
1306 
1307 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1308 {
1309     float64 result = float64_mul(op1, op1, stat);
1310     if (!float64_is_any_nan(result)) {
1311         result = float64_set_sign(result, op2 & 1);
1312     }
1313     return result;
1314 }
1315 
1316 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1317 {
1318     return float16_abs(float16_sub(op1, op2, stat));
1319 }
1320 
1321 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1322 {
1323     return float32_abs(float32_sub(op1, op2, stat));
1324 }
1325 
1326 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1327 {
1328     return float64_abs(float64_sub(op1, op2, stat));
1329 }
1330 
1331 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
1332 static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
1333 {
1334     float16 r = float16_sub(op1, op2, stat);
1335     return float16_is_any_nan(r) ? r : float16_abs(r);
1336 }
1337 
1338 static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
1339 {
1340     float32 r = float32_sub(op1, op2, stat);
1341     return float32_is_any_nan(r) ? r : float32_abs(r);
1342 }
1343 
1344 static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
1345 {
1346     float64 r = float64_sub(op1, op2, stat);
1347     return float64_is_any_nan(r) ? r : float64_abs(r);
1348 }
1349 
1350 /*
1351  * Reciprocal step. These are the AArch32 versions, which use a
1352  * non-fused multiply-and-subtract.
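 * The result is 2.0 - (op1 * op2), with the 0 * Inf cases returning
 * exactly 2.0.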
1353  */
1354 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1355 {
1356     op1 = float16_squash_input_denormal(op1, stat);
1357     op2 = float16_squash_input_denormal(op2, stat);
1358 
1359     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1360         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1361         return float16_two;
1362     }
1363     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1364 }
1365 
1366 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1367 {
1368     op1 = float32_squash_input_denormal(op1, stat);
1369     op2 = float32_squash_input_denormal(op2, stat);
1370 
1371     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1372         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1373         return float32_two;
1374     }
1375     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1376 }
1377 
1378 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1379 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1380 {
1381     op1 = float16_squash_input_denormal(op1, stat);
1382     op2 = float16_squash_input_denormal(op2, stat);
1383 
1384     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1385         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1386         return float16_one_point_five;
1387     }
1388     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1389     return float16_div(op1, float16_two, stat);
1390 }
1391 
1392 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1393 {
1394     op1 = float32_squash_input_denormal(op1, stat);
1395     op2 = float32_squash_input_denormal(op2, stat);
1396 
1397     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1398         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1399         return float32_one_point_five;
1400     }
1401     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1402     return float32_div(op1, float32_two, stat);
1403 }
1404 
1405 #define DO_3OP(NAME, FUNC, TYPE) \
1406 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1407                   float_status *stat, uint32_t desc)                       \
1408 {                                                                          \
1409     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1410     TYPE *d = vd, *n = vn, *m = vm;                                        \
1411     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1412         d[i] = FUNC(n[i], m[i], stat);                                     \
1413     }                                                                      \
1414     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1415 }
1416 
1417 DO_3OP(gvec_fadd_h, float16_add, float16)
1418 DO_3OP(gvec_fadd_s, float32_add, float32)
1419 DO_3OP(gvec_fadd_d, float64_add, float64)
1420 
1421 DO_3OP(gvec_fsub_h, float16_sub, float16)
1422 DO_3OP(gvec_fsub_s, float32_sub, float32)
1423 DO_3OP(gvec_fsub_d, float64_sub, float64)
1424 
1425 DO_3OP(gvec_fmul_h, float16_mul, float16)
1426 DO_3OP(gvec_fmul_s, float32_mul, float32)
1427 DO_3OP(gvec_fmul_d, float64_mul, float64)
1428 
1429 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1430 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1431 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1432 
1433 DO_3OP(gvec_fabd_h, float16_abd, float16)
1434 DO_3OP(gvec_fabd_s, float32_abd, float32)
1435 DO_3OP(gvec_fabd_d, float64_abd, float64)
1436 
1437 DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
1438 DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
1439 DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)
1440 
1441 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1442 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1443 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1444 
1445 DO_3OP(gvec_fcge_h, float16_cge, float16)
1446 DO_3OP(gvec_fcge_s, float32_cge, float32)
1447 DO_3OP(gvec_fcge_d, float64_cge, float64)
1448 
1449 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1450 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1451 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1452 
1453 DO_3OP(gvec_facge_h, float16_acge, float16)
1454 DO_3OP(gvec_facge_s, float32_acge, float32)
1455 DO_3OP(gvec_facge_d, float64_acge, float64)
1456 
1457 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1458 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1459 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1460 
1461 DO_3OP(gvec_fmax_h, float16_max, float16)
1462 DO_3OP(gvec_fmax_s, float32_max, float32)
1463 DO_3OP(gvec_fmax_d, float64_max, float64)
1464 
1465 DO_3OP(gvec_fmin_h, float16_min, float16)
1466 DO_3OP(gvec_fmin_s, float32_min, float32)
1467 DO_3OP(gvec_fmin_d, float64_min, float64)
1468 
1469 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1470 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1471 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1472 
1473 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1474 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1475 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1476 
1477 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1478 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1479 
1480 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1481 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1482 
1483 #ifdef TARGET_AARCH64
1484 DO_3OP(gvec_fdiv_h, float16_div, float16)
1485 DO_3OP(gvec_fdiv_s, float32_div, float32)
1486 DO_3OP(gvec_fdiv_d, float64_div, float64)
1487 
1488 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1489 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1490 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1491 
1492 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1493 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1494 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1495 
1496 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1497 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1498 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1499 
1500 DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
1501 DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
1502 DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)
1503 
1504 DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
1505 DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
1506 DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)
1507 
1508 DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
1509 DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
1510 DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)
1511 
1512 DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
1513 DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
1514 DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)
1515 
1516 #endif
1517 #undef DO_3OP
1518 
1519 /* Non-fused multiply-add (unlike float16_muladd etc., which are fused) */
1520 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1521                                  float_status *stat)
1522 {
1523     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1524 }
1525 
1526 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1527                                  float_status *stat)
1528 {
1529     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1530 }
1531 
1532 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1533                                  float_status *stat)
1534 {
1535     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1536 }
1537 
1538 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1539                                  float_status *stat)
1540 {
1541     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1542 }
1543 
1544 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1545 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1546                                 float_status *stat)
1547 {
1548     return float16_muladd(op1, op2, dest, 0, stat);
1549 }
1550 
1551 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1552                                  float_status *stat)
1553 {
1554     return float32_muladd(op1, op2, dest, 0, stat);
1555 }
1556 
1557 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1558                                  float_status *stat)
1559 {
1560     return float64_muladd(op1, op2, dest, 0, stat);
1561 }
1562 
1563 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1564                                  float_status *stat)
1565 {
1566     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1567 }
1568 
1569 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1570                                  float_status *stat)
1571 {
1572     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1573 }
1574 
1575 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1576                                  float_status *stat)
1577 {
1578     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1579 }
1580 
1581 static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2,
1582                                  float_status *stat)
1583 {
1584     return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1585 }
1586 
1587 static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2,
1588                                  float_status *stat)
1589 {
1590     return float32_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1591 }
1592 
1593 static float64 float64_ah_mulsub_f(float64 dest, float64 op1, float64 op2,
1594                                  float_status *stat)
1595 {
1596     return float64_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1597 }
1598 
1599 #define DO_MULADD(NAME, FUNC, TYPE)                                        \
1600 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1601                   float_status *stat, uint32_t desc)                       \
1602 {                                                                          \
1603     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1604     TYPE *d = vd, *n = vn, *m = vm;                                        \
1605     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1606         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1607     }                                                                      \
1608     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1609 }
1610 
1611 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1612 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1613 
1614 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1615 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1616 
1617 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1618 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1619 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1620 
1621 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1622 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1623 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1624 
1625 DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16)
1626 DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32)
1627 DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64)
1628 
1629 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1630  * For AdvSIMD, there is of course only one such vector segment.
1631  */
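/*
 * For example, in DO_MUL_IDX below with 32-bit elements and oprsz == 32
 * (a 256-bit SVE vector): segment == 4 and idx selects one of elements
 * 0..3, so m[idx] multiplies elements 0..3 of n and m[4 + idx]
 * multiplies elements 4..7.
 */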
1632 
1633 #define DO_MUL_IDX(NAME, TYPE, H) \
1634 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1635 {                                                                          \
1636     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1637     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1638     intptr_t idx = simd_data(desc);                                        \
1639     TYPE *d = vd, *n = vn, *m = vm;                                        \
1640     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1641         TYPE mm = m[H(i + idx)];                                           \
1642         for (j = 0; j < segment; j++) {                                    \
1643             d[i + j] = n[i + j] * mm;                                      \
1644         }                                                                  \
1645     }                                                                      \
1646     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1647 }
1648 
1649 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1650 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1651 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1652 
1653 #undef DO_MUL_IDX
1654 
1655 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1656 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1657 {                                                                          \
1658     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1659     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1660     intptr_t idx = simd_data(desc);                                        \
1661     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1662     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1663         TYPE mm = m[H(i + idx)];                                           \
1664         for (j = 0; j < segment; j++) {                                    \
1665             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1666         }                                                                  \
1667     }                                                                      \
1668     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1669 }
1670 
1671 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1672 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1673 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1674 
1675 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1676 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1677 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1678 
1679 #undef DO_MLA_IDX
1680 
1681 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1682 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1683                   float_status *stat, uint32_t desc)                       \
1684 {                                                                          \
1685     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1686     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1687     intptr_t idx = simd_data(desc);                                        \
1688     TYPE *d = vd, *n = vn, *m = vm;                                        \
1689     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1690         TYPE mm = m[H(i + idx)];                                           \
1691         for (j = 0; j < segment; j++) {                                    \
1692             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1693         }                                                                  \
1694     }                                                                      \
1695     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1696 }
1697 
1698 #define nop(N, M, S) (M)
1699 
1700 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1701 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1702 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1703 
1704 #ifdef TARGET_AARCH64
1705 
1706 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1707 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1708 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1709 
1710 #endif
1711 
1712 #undef nop
1713 
1714 /*
1715  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1716  * the fused ops below they accumulate both from and into Vd.
1717  */
1718 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1719 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1720 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1721 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1722 
1723 #undef DO_FMUL_IDX
1724 
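/*
 * NEGX flips the sign bit of the first multiplicand before the fused
 * multiply-add (the FPCR.AH == 0 FMLS behaviour, which also negates a
 * NaN input), whereas NEGF passes float_muladd_negate_product so that
 * only the product is negated (the FPCR.AH == 1 behaviour, which does
 * not disturb NaN inputs).
 */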
1725 #define DO_FMLA_IDX(NAME, TYPE, H, NEGX, NEGF)                             \
1726 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1727                   float_status *stat, uint32_t desc)                       \
1728 {                                                                          \
1729     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1730     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1731     intptr_t idx = simd_data(desc);                                        \
1732     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1733     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1734         TYPE mm = m[H(i + idx)];                                           \
1735         for (j = 0; j < segment; j++) {                                    \
1736             d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm,                  \
1737                                      a[i + j], NEGF, stat);                \
1738         }                                                                  \
1739     }                                                                      \
1740     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1741 }
1742 
1743 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0)
1744 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0)
1745 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0)
1746 
1747 DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0)
1748 DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0)
1749 DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0)
1750 
1751 DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product)
1752 DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product)
1753 DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product)
1754 
1755 #undef DO_FMLA_IDX
1756 
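/*
 * Saturating add/sub with the sticky QC flag.  WTYPE is wide enough to
 * hold the exact result of OP on TYPEN/TYPEM operands, so the
 * comparison against MIN/MAX detects saturation precisely; any
 * saturated lane sets QC via the vq pointer.
 */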
1757 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1758 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1759 {                                                                          \
1760     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1761     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1762     bool q = false;                                                        \
1763     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1764         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1765         if (dd < MIN) {                                                    \
1766             dd = MIN;                                                      \
1767             q = true;                                                      \
1768         } else if (dd > MAX) {                                             \
1769             dd = MAX;                                                      \
1770             q = true;                                                      \
1771         }                                                                  \
1772         d[i] = dd;                                                         \
1773     }                                                                      \
1774     if (q) {                                                               \
1775         uint32_t *qc = vq;                                                 \
1776         qc[0] = 1;                                                         \
1777     }                                                                      \
1778     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1779 }
1780 
1781 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1782 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1783 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1784 
1785 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1786 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1787 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1788 
1789 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1790 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1791 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1792 
1793 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1794 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1795 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1796 
1797 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1798 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1799 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1800 
1801 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1802 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1803 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1804 
1805 #undef DO_SAT
1806 
1807 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1808                           void *vm, uint32_t desc)
1809 {
1810     intptr_t i, oprsz = simd_oprsz(desc);
1811     uint64_t *d = vd, *n = vn, *m = vm;
1812     bool q = false;
1813 
1814     for (i = 0; i < oprsz / 8; i++) {
1815         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1816         if (dd < nn) {
1817             dd = UINT64_MAX;
1818             q = true;
1819         }
1820         d[i] = dd;
1821     }
1822     if (q) {
1823         uint32_t *qc = vq;
1824         qc[0] = 1;
1825     }
1826     clear_tail(d, oprsz, simd_maxsz(desc));
1827 }
1828 
1829 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1830                           void *vm, uint32_t desc)
1831 {
1832     intptr_t i, oprsz = simd_oprsz(desc);
1833     uint64_t *d = vd, *n = vn, *m = vm;
1834     bool q = false;
1835 
1836     for (i = 0; i < oprsz / 8; i++) {
1837         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1838         if (nn < mm) {
1839             dd = 0;
1840             q = true;
1841         }
1842         d[i] = dd;
1843     }
1844     if (q) {
1845         uint32_t *qc = vq;
1846         qc[0] = 1;
1847     }
1848     clear_tail(d, oprsz, simd_maxsz(desc));
1849 }
1850 
1851 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1852                           void *vm, uint32_t desc)
1853 {
1854     intptr_t i, oprsz = simd_oprsz(desc);
1855     int64_t *d = vd, *n = vn, *m = vm;
1856     bool q = false;
1857 
1858     for (i = 0; i < oprsz / 8; i++) {
1859         int64_t nn = n[i], mm = m[i], dd = nn + mm;
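        /*
         * Signed overflow occurred iff nn and mm have the same sign and
         * dd has the opposite sign; saturate towards the sign of nn:
         * (nn >> 63) ^ ~INT64_MIN is INT64_MAX for nn >= 0 and
         * INT64_MIN otherwise.
         */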
1860         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1861             dd = (nn >> 63) ^ ~INT64_MIN;
1862             q = true;
1863         }
1864         d[i] = dd;
1865     }
1866     if (q) {
1867         uint32_t *qc = vq;
1868         qc[0] = 1;
1869     }
1870     clear_tail(d, oprsz, simd_maxsz(desc));
1871 }
1872 
1873 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1874                           void *vm, uint32_t desc)
1875 {
1876     intptr_t i, oprsz = simd_oprsz(desc);
1877     int64_t *d = vd, *n = vn, *m = vm;
1878     bool q = false;
1879 
1880     for (i = 0; i < oprsz / 8; i++) {
1881         int64_t nn = n[i], mm = m[i], dd = nn - mm;
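        /* Overflow iff nn and mm differ in sign and dd's sign differs from nn. */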
1882         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1883             dd = (nn >> 63) ^ ~INT64_MIN;
1884             q = true;
1885         }
1886         d[i] = dd;
1887     }
1888     if (q) {
1889         uint32_t *qc = vq;
1890         qc[0] = 1;
1891     }
1892     clear_tail(d, oprsz, simd_maxsz(desc));
1893 }
1894 
1895 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1896                            void *vm, uint32_t desc)
1897 {
1898     intptr_t i, oprsz = simd_oprsz(desc);
1899     uint64_t *d = vd, *n = vn, *m = vm;
1900     bool q = false;
1901 
1902     for (i = 0; i < oprsz / 8; i++) {
1903         uint64_t nn = n[i];
1904         int64_t mm = m[i];
1905         uint64_t dd = nn + mm;
1906 
1907         if (mm < 0) {
1908             if (nn < (uint64_t)-mm) {
1909                 dd = 0;
1910                 q = true;
1911             }
1912         } else {
1913             if (dd < nn) {
1914                 dd = UINT64_MAX;
1915                 q = true;
1916             }
1917         }
1918         d[i] = dd;
1919     }
1920     if (q) {
1921         uint32_t *qc = vq;
1922         qc[0] = 1;
1923     }
1924     clear_tail(d, oprsz, simd_maxsz(desc));
1925 }
1926 
1927 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1928                            void *vm, uint32_t desc)
1929 {
1930     intptr_t i, oprsz = simd_oprsz(desc);
1931     uint64_t *d = vd, *n = vn, *m = vm;
1932     bool q = false;
1933 
1934     for (i = 0; i < oprsz / 8; i++) {
1935         int64_t nn = n[i];
1936         uint64_t mm = m[i];
1937         int64_t dd = nn + mm;
1938 
1939         if (mm > (uint64_t)(INT64_MAX - nn)) {
1940             dd = INT64_MAX;
1941             q = true;
1942         }
1943         d[i] = dd;
1944     }
1945     if (q) {
1946         uint32_t *qc = vq;
1947         qc[0] = 1;
1948     }
1949     clear_tail(d, oprsz, simd_maxsz(desc));
1950 }
1951 
1952 #define DO_SRA(NAME, TYPE)                              \
1953 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1954 {                                                       \
1955     intptr_t i, oprsz = simd_oprsz(desc);               \
1956     int shift = simd_data(desc);                        \
1957     TYPE *d = vd, *n = vn;                              \
1958     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1959         d[i] += n[i] >> shift;                          \
1960     }                                                   \
1961     clear_tail(d, oprsz, simd_maxsz(desc));             \
1962 }
1963 
1964 DO_SRA(gvec_ssra_b, int8_t)
1965 DO_SRA(gvec_ssra_h, int16_t)
1966 DO_SRA(gvec_ssra_s, int32_t)
1967 DO_SRA(gvec_ssra_d, int64_t)
1968 
1969 DO_SRA(gvec_usra_b, uint8_t)
1970 DO_SRA(gvec_usra_h, uint16_t)
1971 DO_SRA(gvec_usra_s, uint32_t)
1972 DO_SRA(gvec_usra_d, uint64_t)
1973 
1974 #undef DO_SRA
1975 
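/*
 * Rounding shift right: shifting by (shift - 1) first preserves the
 * bit that decides rounding, so the result equals
 *   (n + (1 << (shift - 1))) >> shift
 * evaluated at full precision, with no intermediate overflow.
 */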
1976 #define DO_RSHR(NAME, TYPE)                             \
1977 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1978 {                                                       \
1979     intptr_t i, oprsz = simd_oprsz(desc);               \
1980     int shift = simd_data(desc);                        \
1981     TYPE *d = vd, *n = vn;                              \
1982     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1983         TYPE tmp = n[i] >> (shift - 1);                 \
1984         d[i] = (tmp >> 1) + (tmp & 1);                  \
1985     }                                                   \
1986     clear_tail(d, oprsz, simd_maxsz(desc));             \
1987 }
1988 
1989 DO_RSHR(gvec_srshr_b, int8_t)
1990 DO_RSHR(gvec_srshr_h, int16_t)
1991 DO_RSHR(gvec_srshr_s, int32_t)
1992 DO_RSHR(gvec_srshr_d, int64_t)
1993 
1994 DO_RSHR(gvec_urshr_b, uint8_t)
1995 DO_RSHR(gvec_urshr_h, uint16_t)
1996 DO_RSHR(gvec_urshr_s, uint32_t)
1997 DO_RSHR(gvec_urshr_d, uint64_t)
1998 
1999 #undef DO_RSHR
2000 
2001 #define DO_RSRA(NAME, TYPE)                             \
2002 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2003 {                                                       \
2004     intptr_t i, oprsz = simd_oprsz(desc);               \
2005     int shift = simd_data(desc);                        \
2006     TYPE *d = vd, *n = vn;                              \
2007     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2008         TYPE tmp = n[i] >> (shift - 1);                 \
2009         d[i] += (tmp >> 1) + (tmp & 1);                 \
2010     }                                                   \
2011     clear_tail(d, oprsz, simd_maxsz(desc));             \
2012 }
2013 
2014 DO_RSRA(gvec_srsra_b, int8_t)
2015 DO_RSRA(gvec_srsra_h, int16_t)
2016 DO_RSRA(gvec_srsra_s, int32_t)
2017 DO_RSRA(gvec_srsra_d, int64_t)
2018 
2019 DO_RSRA(gvec_ursra_b, uint8_t)
2020 DO_RSRA(gvec_ursra_h, uint16_t)
2021 DO_RSRA(gvec_ursra_s, uint32_t)
2022 DO_RSRA(gvec_ursra_d, uint64_t)
2023 
2024 #undef DO_RSRA
2025 
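/*
 * Shift-and-insert: SRI writes n[i] >> shift into the low
 * (esize - shift) bits of d[i], keeping the top 'shift' bits of the
 * destination; SLI writes n[i] << shift, keeping the bottom 'shift'
 * bits of the destination.
 */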
2026 #define DO_SRI(NAME, TYPE)                              \
2027 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2028 {                                                       \
2029     intptr_t i, oprsz = simd_oprsz(desc);               \
2030     int shift = simd_data(desc);                        \
2031     TYPE *d = vd, *n = vn;                              \
2032     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2033         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
2034     }                                                   \
2035     clear_tail(d, oprsz, simd_maxsz(desc));             \
2036 }
2037 
2038 DO_SRI(gvec_sri_b, uint8_t)
2039 DO_SRI(gvec_sri_h, uint16_t)
2040 DO_SRI(gvec_sri_s, uint32_t)
2041 DO_SRI(gvec_sri_d, uint64_t)
2042 
2043 #undef DO_SRI
2044 
2045 #define DO_SLI(NAME, TYPE)                              \
2046 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2047 {                                                       \
2048     intptr_t i, oprsz = simd_oprsz(desc);               \
2049     int shift = simd_data(desc);                        \
2050     TYPE *d = vd, *n = vn;                              \
2051     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2052         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
2053     }                                                   \
2054     clear_tail(d, oprsz, simd_maxsz(desc));             \
2055 }
2056 
2057 DO_SLI(gvec_sli_b, uint8_t)
2058 DO_SLI(gvec_sli_h, uint16_t)
2059 DO_SLI(gvec_sli_s, uint32_t)
2060 DO_SLI(gvec_sli_d, uint64_t)
2061 
2062 #undef DO_SLI
2063 
2064 /*
2065  * Convert float16 to float32, raising no exceptions and
2066  * preserving exceptional values, including SNaN.
2067  * This is effectively an unpack+repack operation.
2068  */
2069 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
2070 {
2071     const int f16_bias = 15;
2072     const int f32_bias = 127;
2073     uint32_t sign = extract32(f16, 15, 1);
2074     uint32_t exp = extract32(f16, 10, 5);
2075     uint32_t frac = extract32(f16, 0, 10);
2076 
2077     if (exp == 0x1f) {
2078         /* Inf or NaN */
2079         exp = 0xff;
2080     } else if (exp == 0) {
2081         /* Zero or denormal.  */
2082         if (frac != 0) {
2083             if (fz16) {
2084                 frac = 0;
2085             } else {
2086                 /*
2087                  * Denormal; these are all normal float32.
2088                  * Shift the fraction so that the msb is at bit 11,
2089                  * then remove bit 11 as the implicit bit of the
2090                  * normalized float32.  Note that we still go through
2091                  * the shift for normal numbers below, to put the
2092                  * float32 fraction at the right place.
2093                  */
2094                 int shift = clz32(frac) - 21;
2095                 frac = (frac << shift) & 0x3ff;
2096                 exp = f32_bias - f16_bias - shift + 1;
2097             }
2098         }
2099     } else {
2100         /* Normal number; adjust the bias.  */
2101         exp += f32_bias - f16_bias;
2102     }
2103     sign <<= 31;
2104     exp <<= 23;
2105     frac <<= 23 - 10;
2106 
2107     return sign | exp | frac;
2108 }
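/*
 * Worked example for the denormal path above: the largest f16 denormal
 * 0x03ff (0x3ff * 2^-24) has clz32(0x3ff) == 22, so shift == 1,
 * frac == 0x3fe and exp == 112, giving the float32 encoding
 * 0x387fc000 == (1 + 0x3fe/0x400) * 2^-15 == 0x3ff * 2^-24.
 */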
2109 
2110 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2111 {
2112     /*
2113      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2114      * Load the 2nd qword iff is_q & is_2.
2115      * Shift to the 2nd dword iff !is_q & is_2.
2116      * For !is_q & !is_2, the upper bits of the result are garbage.
2117      */
2118     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2119 }
2120 
2121 /*
2122  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2123  * as there are not yet SVE versions that might use blocking.
2124  */
2125 
2126 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2127                      uint64_t negx, int negf, uint32_t desc, bool fz16)
2128 {
2129     intptr_t i, oprsz = simd_oprsz(desc);
2130     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2131     int is_q = oprsz == 16;
2132     uint64_t n_4, m_4;
2133 
2134     /*
2135      * Pre-load all of the f16 data, avoiding overlap issues.
2136      * Negate all inputs for AH=0 FMLSL at once.
2137      */
2138     n_4 = load4_f16(vn, is_q, is_2) ^ negx;
2139     m_4 = load4_f16(vm, is_q, is_2);
2140 
2141     for (i = 0; i < oprsz / 4; i++) {
2142         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2143         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2144         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
2145     }
2146     clear_tail(d, oprsz, simd_maxsz(desc));
2147 }
2148 
2149 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2150                             CPUARMState *env, uint32_t desc)
2151 {
2152     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2153     uint64_t negx = is_s ? 0x8000800080008000ull : 0;
2154 
2155     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, negx, 0, desc,
2156              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2157 }
2158 
2159 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2160                             CPUARMState *env, uint32_t desc)
2161 {
2162     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2163     uint64_t negx = 0;
2164     int negf = 0;
2165 
2166     if (is_s) {
2167         if (env->vfp.fpcr & FPCR_AH) {
2168             negf = float_muladd_negate_product;
2169         } else {
2170             negx = 0x8000800080008000ull;
2171         }
2172     }
2173     do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, negx, negf, desc,
2174              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2175 }
2176 
2177 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2178                                CPUARMState *env, uint32_t desc)
2179 {
2180     intptr_t i, oprsz = simd_oprsz(desc);
2181     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2182     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2183     float_status *status = &env->vfp.fp_status_a64;
2184     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2185     int negx = 0, negf = 0;
2186 
2187     if (is_s) {
2188         if (env->vfp.fpcr & FPCR_AH) {
2189             negf = float_muladd_negate_product;
2190         } else {
2191             negx = 0x8000;
2192         }
2193     }
2194 
2195     for (i = 0; i < oprsz; i += sizeof(float32)) {
2196         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negx;
2197         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2198         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2199         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2200         float32 aa = *(float32 *)(va + H1_4(i));
2201 
2202         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, negf, status);
2203     }
2204 }
2205 
2206 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2207                          uint64_t negx, int negf, uint32_t desc, bool fz16)
2208 {
2209     intptr_t i, oprsz = simd_oprsz(desc);
2210     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2211     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2212     int is_q = oprsz == 16;
2213     uint64_t n_4;
2214     float32 m_1;
2215 
2216     /*
2217      * Pre-load all of the f16 data, avoiding overlap issues.
2218      * Negate all inputs for AH=0 FMLSL at once.
2219      */
2220     n_4 = load4_f16(vn, is_q, is_2) ^ negx;
2221     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2222 
2223     for (i = 0; i < oprsz / 4; i++) {
2224         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2225         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
2226     }
2227     clear_tail(d, oprsz, simd_maxsz(desc));
2228 }
2229 
2230 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2231                                 CPUARMState *env, uint32_t desc)
2232 {
2233     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2234     uint64_t negx = is_s ? 0x8000800080008000ull : 0;
2235 
2236     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, negx, 0, desc,
2237                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2238 }
2239 
2240 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2241                                 CPUARMState *env, uint32_t desc)
2242 {
2243     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2244     uint64_t negx = 0;
2245     int negf = 0;
2246 
2247     if (is_s) {
2248         if (env->vfp.fpcr & FPCR_AH) {
2249             negf = float_muladd_negate_product;
2250         } else {
2251             negx = 0x8000800080008000ull;
2252         }
2253     }
2254     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, negx, negf, desc,
2255                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2256 }
2257 
2258 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2259                                CPUARMState *env, uint32_t desc)
2260 {
2261     intptr_t i, j, oprsz = simd_oprsz(desc);
2262     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2263     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2264     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2265     float_status *status = &env->vfp.fp_status_a64;
2266     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2267     int negx = 0, negf = 0;
2268 
2269     if (is_s) {
2270         if (env->vfp.fpcr & FPCR_AH) {
2271             negf = float_muladd_negate_product;
2272         } else {
2273             negx = 0x8000;
2274         }
2275     }
2276 
2277     for (i = 0; i < oprsz; i += 16) {
2278         float16 mm_16 = *(float16 *)(vm + i + idx);
2279         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2280 
2281         for (j = 0; j < 16; j += sizeof(float32)) {
2282             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negx;
2283             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2284             float32 aa = *(float32 *)(va + H1_4(i + j));
2285 
2286             *(float32 *)(vd + H1_4(i + j)) =
2287                 float32_muladd(nn, mm, aa, negf, status);
2288         }
2289     }
2290 }
2291 
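/*
 * Shift by a signed, per-element amount taken from the least
 * significant byte of each m element: positive counts shift left,
 * negative counts shift right.  Left shifts of >= esize give 0; signed
 * right shifts clamp the count to esize - 1 (sign fill), while
 * unsigned right shifts of >= esize give 0.
 */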
2292 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2293 {
2294     intptr_t i, opr_sz = simd_oprsz(desc);
2295     int8_t *d = vd, *n = vn, *m = vm;
2296 
2297     for (i = 0; i < opr_sz; ++i) {
2298         int8_t mm = m[i];
2299         int8_t nn = n[i];
2300         int8_t res = 0;
2301         if (mm >= 0) {
2302             if (mm < 8) {
2303                 res = nn << mm;
2304             }
2305         } else {
2306             res = nn >> (mm > -8 ? -mm : 7);
2307         }
2308         d[i] = res;
2309     }
2310     clear_tail(d, opr_sz, simd_maxsz(desc));
2311 }
2312 
2313 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2314 {
2315     intptr_t i, opr_sz = simd_oprsz(desc);
2316     int16_t *d = vd, *n = vn, *m = vm;
2317 
2318     for (i = 0; i < opr_sz / 2; ++i) {
2319         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2320         int16_t nn = n[i];
2321         int16_t res = 0;
2322         if (mm >= 0) {
2323             if (mm < 16) {
2324                 res = nn << mm;
2325             }
2326         } else {
2327             res = nn >> (mm > -16 ? -mm : 15);
2328         }
2329         d[i] = res;
2330     }
2331     clear_tail(d, opr_sz, simd_maxsz(desc));
2332 }
2333 
2334 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2335 {
2336     intptr_t i, opr_sz = simd_oprsz(desc);
2337     uint8_t *d = vd, *n = vn, *m = vm;
2338 
2339     for (i = 0; i < opr_sz; ++i) {
2340         int8_t mm = m[i];
2341         uint8_t nn = n[i];
2342         uint8_t res = 0;
2343         if (mm >= 0) {
2344             if (mm < 8) {
2345                 res = nn << mm;
2346             }
2347         } else {
2348             if (mm > -8) {
2349                 res = nn >> -mm;
2350             }
2351         }
2352         d[i] = res;
2353     }
2354     clear_tail(d, opr_sz, simd_maxsz(desc));
2355 }
2356 
2357 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2358 {
2359     intptr_t i, opr_sz = simd_oprsz(desc);
2360     uint16_t *d = vd, *n = vn, *m = vm;
2361 
2362     for (i = 0; i < opr_sz / 2; ++i) {
2363         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2364         uint16_t nn = n[i];
2365         uint16_t res = 0;
2366         if (mm >= 0) {
2367             if (mm < 16) {
2368                 res = nn << mm;
2369             }
2370         } else {
2371             if (mm > -16) {
2372                 res = nn >> -mm;
2373             }
2374         }
2375         d[i] = res;
2376     }
2377     clear_tail(d, opr_sz, simd_maxsz(desc));
2378 }
2379 
2380 /*
2381  * 8x8->8 polynomial multiply.
2382  *
2383  * Polynomial multiplication is like integer multiplication except the
2384  * partial products are XORed, not added.
2385  *
2386  * TODO: expose this as a generic vector operation, as it is a common
2387  * crypto building block.
2388  */
2389 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2390 {
2391     intptr_t i, opr_sz = simd_oprsz(desc);
2392     uint64_t *d = vd, *n = vn, *m = vm;
2393 
2394     for (i = 0; i < opr_sz / 8; ++i) {
2395         d[i] = clmul_8x8_low(n[i], m[i]);
2396     }
2397     clear_tail(d, opr_sz, simd_maxsz(desc));
2398 }
2399 
2400 /*
2401  * 64x64->128 polynomial multiply.
2402  * Because the lanes are not accessed in strict columns,
2403  * this probably cannot be turned into a generic helper.
2404  */
2405 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2406 {
2407     intptr_t i, opr_sz = simd_oprsz(desc);
2408     intptr_t hi = simd_data(desc);
2409     uint64_t *d = vd, *n = vn, *m = vm;
2410 
2411     for (i = 0; i < opr_sz / 8; i += 2) {
2412         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2413         d[i] = int128_getlo(r);
2414         d[i + 1] = int128_gethi(r);
2415     }
2416     clear_tail(d, opr_sz, simd_maxsz(desc));
2417 }
2418 
2419 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2420 {
2421     int hi = simd_data(desc);
2422     uint64_t *d = vd, *n = vn, *m = vm;
2423     uint64_t nn = n[hi], mm = m[hi];
2424 
2425     d[0] = clmul_8x4_packed(nn, mm);
2426     nn >>= 32;
2427     mm >>= 32;
2428     d[1] = clmul_8x4_packed(nn, mm);
2429 
2430     clear_tail(d, 16, simd_maxsz(desc));
2431 }
2432 
2433 #ifdef TARGET_AARCH64
2434 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2435 {
2436     int shift = simd_data(desc) * 8;
2437     intptr_t i, opr_sz = simd_oprsz(desc);
2438     uint64_t *d = vd, *n = vn, *m = vm;
2439 
2440     for (i = 0; i < opr_sz / 8; ++i) {
2441         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2442     }
2443 }
2444 
2445 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2446 {
2447     intptr_t sel = H4(simd_data(desc));
2448     intptr_t i, opr_sz = simd_oprsz(desc);
2449     uint32_t *n = vn, *m = vm;
2450     uint64_t *d = vd;
2451 
2452     for (i = 0; i < opr_sz / 8; ++i) {
2453         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2454     }
2455 }
2456 #endif
2457 
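/*
 * Compare each element against zero, producing the usual all-ones /
 * all-zeros element mask: the comparison yields 0 or 1, and negating
 * a true result gives -1, i.e. all bits set.
 */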
2458 #define DO_CMP0(NAME, TYPE, OP)                         \
2459 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2460 {                                                       \
2461     intptr_t i, opr_sz = simd_oprsz(desc);              \
2462     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2463         TYPE nn = *(TYPE *)(vn + i);                    \
2464         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2465     }                                                   \
2466     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2467 }
2468 
2469 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2470 DO_CMP0(gvec_clt0_b, int8_t, <)
2471 DO_CMP0(gvec_cle0_b, int8_t, <=)
2472 DO_CMP0(gvec_cgt0_b, int8_t, >)
2473 DO_CMP0(gvec_cge0_b, int8_t, >=)
2474 
2475 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2476 DO_CMP0(gvec_clt0_h, int16_t, <)
2477 DO_CMP0(gvec_cle0_h, int16_t, <=)
2478 DO_CMP0(gvec_cgt0_h, int16_t, >)
2479 DO_CMP0(gvec_cge0_h, int16_t, >=)
2480 
2481 #undef DO_CMP0
2482 
2483 #define DO_ABD(NAME, TYPE)                                      \
2484 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2485 {                                                               \
2486     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2487     TYPE *d = vd, *n = vn, *m = vm;                             \
2488                                                                 \
2489     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2490         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2491     }                                                           \
2492     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2493 }
2494 
2495 DO_ABD(gvec_sabd_b, int8_t)
2496 DO_ABD(gvec_sabd_h, int16_t)
2497 DO_ABD(gvec_sabd_s, int32_t)
2498 DO_ABD(gvec_sabd_d, int64_t)
2499 
2500 DO_ABD(gvec_uabd_b, uint8_t)
2501 DO_ABD(gvec_uabd_h, uint16_t)
2502 DO_ABD(gvec_uabd_s, uint32_t)
2503 DO_ABD(gvec_uabd_d, uint64_t)
2504 
2505 #undef DO_ABD
2506 
2507 #define DO_ABA(NAME, TYPE)                                      \
2508 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2509 {                                                               \
2510     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2511     TYPE *d = vd, *n = vn, *m = vm;                             \
2512                                                                 \
2513     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2514         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2515     }                                                           \
2516     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2517 }
2518 
2519 DO_ABA(gvec_saba_b, int8_t)
2520 DO_ABA(gvec_saba_h, int16_t)
2521 DO_ABA(gvec_saba_s, int32_t)
2522 DO_ABA(gvec_saba_d, int64_t)
2523 
2524 DO_ABA(gvec_uaba_b, uint8_t)
2525 DO_ABA(gvec_uaba_h, uint16_t)
2526 DO_ABA(gvec_uaba_s, uint32_t)
2527 DO_ABA(gvec_uaba_d, uint64_t)
2528 
2529 #undef DO_ABA
2530 
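/*
 * Pairwise operations: results of pairing within n land in the low
 * half of d, results of pairing within m in the high half.  Only
 * d == m needs the scratch copy; when d == n the writes to d[i] in the
 * first loop never overtake its reads of n[2i] and n[2i + 1], and the
 * second loop does not read n at all.
 */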
2531 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2532 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
2533                   float_status *stat, uint32_t desc)                       \
2534 {                                                                          \
2535     ARMVectorReg scratch;                                                  \
2536     intptr_t oprsz = simd_oprsz(desc);                                     \
2537     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2538     TYPE *d = vd, *n = vn, *m = vm;                                        \
2539     if (unlikely(d == m)) {                                                \
2540         m = memcpy(&scratch, m, oprsz);                                    \
2541     }                                                                      \
2542     for (intptr_t i = 0; i < half; ++i) {                                  \
2543         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2544     }                                                                      \
2545     for (intptr_t i = 0; i < half; ++i) {                                  \
2546         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2547     }                                                                      \
2548     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2549 }
2550 
2551 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2552 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2553 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2554 
2555 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2556 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2557 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2558 
2559 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2560 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2561 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2562 
2563 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2564 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2565 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2566 
2567 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2568 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2569 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2570 
2571 #ifdef TARGET_AARCH64
2572 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
2573 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
2574 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, )
2575 
2576 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
2577 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
2578 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, )
2579 #endif
2580 
2581 #undef DO_3OP_PAIR
2582 
2583 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2584 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2585 {                                                               \
2586     ARMVectorReg scratch;                                       \
2587     intptr_t oprsz = simd_oprsz(desc);                          \
2588     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2589     TYPE *d = vd, *n = vn, *m = vm;                             \
2590     if (unlikely(d == m)) {                                     \
2591         m = memcpy(&scratch, m, oprsz);                         \
2592     }                                                           \
2593     for (intptr_t i = 0; i < half; ++i) {                       \
2594         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2595     }                                                           \
2596     for (intptr_t i = 0; i < half; ++i) {                       \
2597         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2598     }                                                           \
2599     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2600 }
2601 
2602 #define ADD(A, B) (A + B)
2603 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2604 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2605 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2606 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2607 #undef  ADD
2608 
2609 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2610 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2611 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2612 
2613 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2614 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2615 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2616 
2617 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2618 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2619 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2620 
2621 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2622 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2623 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2624 
2625 #undef DO_3OP_PAIR
2626 
2627 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2628     void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
2629     {                                                                   \
2630         intptr_t i, oprsz = simd_oprsz(desc);                           \
2631         int shift = simd_data(desc);                                    \
2632         TYPE *d = vd, *n = vn;                                          \
2633         float_status *fpst = stat;                                      \
2634         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2635             d[i] = FUNC(n[i], shift, fpst);                             \
2636         }                                                               \
2637         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2638     }
2639 
2640 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2641 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2642 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2643 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2644 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2645 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2646 
2647 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2648 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2649 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2650 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2651 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2652 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2653 
2654 #undef DO_VCVT_FIXED
2655 
2656 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2657     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2658     {                                                                   \
2659         intptr_t i, oprsz = simd_oprsz(desc);                           \
2660         uint32_t rmode = simd_data(desc);                               \
2661         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2662         TYPE *d = vd, *n = vn;                                          \
2663         set_float_rounding_mode(rmode, fpst);                           \
2664         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2665             d[i] = FUNC(n[i], 0, fpst);                                 \
2666         }                                                               \
2667         set_float_rounding_mode(prev_rmode, fpst);                      \
2668         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2669     }
2670 
2671 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
2672 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
2673 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2674 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2675 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2676 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2677 
2678 #undef DO_VCVT_RMODE
2679 
2680 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2681     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2682     {                                                                   \
2683         intptr_t i, oprsz = simd_oprsz(desc);                           \
2684         uint32_t rmode = simd_data(desc);                               \
2685         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2686         TYPE *d = vd, *n = vn;                                          \
2687         set_float_rounding_mode(rmode, fpst);                           \
2688         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2689             d[i] = FUNC(n[i], fpst);                                    \
2690         }                                                               \
2691         set_float_rounding_mode(prev_rmode, fpst);                      \
2692         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2693     }
2694 
2695 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2696 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2697 
2698 #undef DO_VRINT_RMODE
2699 
2700 #ifdef TARGET_AARCH64
2701 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
2702 {
2703     const uint8_t *indices = vm;
2704     size_t oprsz = simd_oprsz(desc);
2705     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2706     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2707     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2708     union {
2709         uint8_t b[16];
2710         uint64_t d[2];
2711     } result;
2712 
2713     /*
2714      * We must construct the final result in a temp, lest the output
2715      * overlaps the input table.  For TBL, begin with zero; for TBX,
2716      * begin with the original register contents.  Note that we always
2717      * copy 16 bytes here to avoid an extra branch; clearing the high
2718      * bits of the register for oprsz == 8 is handled below.
2719      */
2720     if (is_tbx) {
2721         memcpy(&result, vd, 16);
2722     } else {
2723         memset(&result, 0, 16);
2724     }
2725 
2726     for (size_t i = 0; i < oprsz; ++i) {
2727         uint32_t index = indices[H1(i)];
2728 
2729         if (index < table_len) {
2730             /*
2731              * Convert index (a byte offset into the virtual table
2732              * which is a series of 128-bit vectors concatenated)
2733              * into the correct register element, bearing in mind
2734              * that the table can wrap around from V31 to V0.
2735              */
2736             const uint8_t *table = (const uint8_t *)
2737                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2738             result.b[H1(i)] = table[H1(index % 16)];
2739         }
2740     }
2741 
2742     memcpy(vd, &result, 16);
2743     clear_tail(vd, oprsz, simd_maxsz(desc));
2744 }
2745 #endif
2746 
2747 /*
2748  * NxN -> N highpart multiply
2749  *
2750  * TODO: expose this as a generic vector operation.
2751  */
2752 
2753 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2754 {
2755     intptr_t i, opr_sz = simd_oprsz(desc);
2756     int8_t *d = vd, *n = vn, *m = vm;
2757 
2758     for (i = 0; i < opr_sz; ++i) {
2759         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2760     }
2761     clear_tail(d, opr_sz, simd_maxsz(desc));
2762 }
2763 
2764 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2765 {
2766     intptr_t i, opr_sz = simd_oprsz(desc);
2767     int16_t *d = vd, *n = vn, *m = vm;
2768 
2769     for (i = 0; i < opr_sz / 2; ++i) {
2770         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2771     }
2772     clear_tail(d, opr_sz, simd_maxsz(desc));
2773 }
2774 
2775 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2776 {
2777     intptr_t i, opr_sz = simd_oprsz(desc);
2778     int32_t *d = vd, *n = vn, *m = vm;
2779 
2780     for (i = 0; i < opr_sz / 4; ++i) {
2781         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2782     }
2783     clear_tail(d, opr_sz, simd_maxsz(desc));
2784 }
2785 
2786 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2787 {
2788     intptr_t i, opr_sz = simd_oprsz(desc);
2789     uint64_t *d = vd, *n = vn, *m = vm;
2790     uint64_t discard;
2791 
2792     for (i = 0; i < opr_sz / 8; ++i) {
2793         muls64(&discard, &d[i], n[i], m[i]);
2794     }
2795     clear_tail(d, opr_sz, simd_maxsz(desc));
2796 }
2797 
2798 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2799 {
2800     intptr_t i, opr_sz = simd_oprsz(desc);
2801     uint8_t *d = vd, *n = vn, *m = vm;
2802 
2803     for (i = 0; i < opr_sz; ++i) {
2804         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2805     }
2806     clear_tail(d, opr_sz, simd_maxsz(desc));
2807 }
2808 
2809 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2810 {
2811     intptr_t i, opr_sz = simd_oprsz(desc);
2812     uint16_t *d = vd, *n = vn, *m = vm;
2813 
2814     for (i = 0; i < opr_sz / 2; ++i) {
2815         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2816     }
2817     clear_tail(d, opr_sz, simd_maxsz(desc));
2818 }
2819 
2820 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2821 {
2822     intptr_t i, opr_sz = simd_oprsz(desc);
2823     uint32_t *d = vd, *n = vn, *m = vm;
2824 
2825     for (i = 0; i < opr_sz / 4; ++i) {
2826         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2827     }
2828     clear_tail(d, opr_sz, simd_maxsz(desc));
2829 }
2830 
2831 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2832 {
2833     intptr_t i, opr_sz = simd_oprsz(desc);
2834     uint64_t *d = vd, *n = vn, *m = vm;
2835     uint64_t discard;
2836 
2837     for (i = 0; i < opr_sz / 8; ++i) {
2838         mulu64(&discard, &d[i], n[i], m[i]);
2839     }
2840     clear_tail(d, opr_sz, simd_maxsz(desc));
2841 }
2842 
2843 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2844 {
2845     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2846     int shr = simd_data(desc);
2847     uint64_t *d = vd, *n = vn, *m = vm;
2848 
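    /* XAR: exclusive-OR the inputs, then rotate each 64-bit result right by 'shr'. */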
2849     for (i = 0; i < opr_sz; ++i) {
2850         d[i] = ror64(n[i] ^ m[i], shr);
2851     }
2852     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2853 }
2854 
2855 /*
2856  * Integer matrix-multiply accumulate
2857  */
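
/*
 * Within each 128-bit segment, both sources are treated as 2x8 matrices
 * of byte elements and the destination/addend as a 2x2 matrix of 32-bit
 * elements; with the second operand effectively transposed, this computes
 *
 *   d[i][j] = a[i][j] + sum(k = 0..7, n[i * 8 + k] * m[j * 8 + k])
 */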
2858 
2859 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2860 {
2861     int8_t *n = vn, *m = vm;
2862 
2863     for (intptr_t k = 0; k < 8; ++k) {
2864         sum += n[H1(k)] * m[H1(k)];
2865     }
2866     return sum;
2867 }
2868 
2869 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2870 {
2871     uint8_t *n = vn, *m = vm;
2872 
2873     for (intptr_t k = 0; k < 8; ++k) {
2874         sum += n[H1(k)] * m[H1(k)];
2875     }
2876     return sum;
2877 }
2878 
2879 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2880 {
2881     uint8_t *n = vn;
2882     int8_t *m = vm;
2883 
2884     for (intptr_t k = 0; k < 8; ++k) {
2885         sum += n[H1(k)] * m[H1(k)];
2886     }
2887     return sum;
2888 }
2889 
2890 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2891                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2892 {
2893     intptr_t seg, opr_sz = simd_oprsz(desc);
2894 
2895     for (seg = 0; seg < opr_sz; seg += 16) {
2896         uint32_t *d = vd + seg;
2897         uint32_t *a = va + seg;
2898         uint32_t sum0, sum1, sum2, sum3;
2899 
2900         /*
2901          * Process the entire segment at once, writing back the
2902          * results only after we've consumed all of the inputs.
2903          *
2904          * Key to indices by column:
2905          *          i   j                  i             j
2906          */
2907         sum0 = a[H4(0 + 0)];
2908         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2909         sum1 = a[H4(0 + 1)];
2910         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2911         sum2 = a[H4(2 + 0)];
2912         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2913         sum3 = a[H4(2 + 1)];
2914         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2915 
2916         d[H4(0)] = sum0;
2917         d[H4(1)] = sum1;
2918         d[H4(2)] = sum2;
2919         d[H4(3)] = sum3;
2920     }
2921     clear_tail(vd, opr_sz, simd_maxsz(desc));
2922 }
2923 
2924 #define DO_MMLA_B(NAME, INNER) \
2925     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2926     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2927 
2928 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2929 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2930 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2931 
2932 /*
2933  * BFloat16 Dot Product
2934  */
2935 
2936 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2937 {
2938     /*
2939      * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2940      * For EBF = 0, we ignore the FPCR bits which determine rounding
2941      * mode and denormal-flushing, and we do unfused multiplies and
2942      * additions with intermediate rounding of all products and sums.
2943      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2944      * and we perform a fused two-way sum-of-products without intermediate
2945      * rounding of the products.
2946      * In either case, we don't set fp exception flags.
2947      *
2948      * EBF is AArch64 only, so even if it's set in the FPCR it has
2949      * no effect on AArch32 instructions.
2950      */
2951     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2952 
2953     *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32;
2954     set_default_nan_mode(true, statusp);
2955 
2956     if (ebf) {
2957         /* EBF=1 needs to do a step with round-to-odd semantics */
2958         *oddstatusp = *statusp;
2959         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2960     } else {
2961         set_flush_to_zero(true, statusp);
2962         set_flush_inputs_to_zero(true, statusp);
2963         set_float_rounding_mode(float_round_to_odd_inf, statusp);
2964     }
2965     return ebf;
2966 }
2967 
2968 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2969 {
2970     float32 t1, t2;
2971 
2972     /*
2973      * Extract each BFloat16 from the element pair, and shift
2974      * them such that they become float32.
2975      */
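    /*
     * BFloat16 is the top half of the equivalent float32, so shifting the
     * even element up by 16 (or masking the odd element in place) yields
     * its exact float32 encoding, e.g. 0x3f80 (bf16 1.0) -> 0x3f800000.
     */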
2976     t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2977     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2978     t1 = float32_add(t1, t2, fpst);
2979     t1 = float32_add(sum, t1, fpst);
2980 
2981     return t1;
2982 }
2983 
2984 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
2985                      float_status *fpst, float_status *fpst_odd)
2986 {
2987     /*
2988      * Compare f16_dotadd() in sme_helper.c, but here we have
2989      * bfloat16 inputs. In particular that means that we do not
2990      * want the FPCR.FZ16 flush semantics, so we use the normal
2991      * float_status for the input handling here.
2992      */
2993     float64 e1r = float32_to_float64(e1 << 16, fpst);
2994     float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
2995     float64 e2r = float32_to_float64(e2 << 16, fpst);
2996     float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
2997     float64 t64;
2998     float32 t32;
2999 
3000     /*
3001      * The ARM pseudocode function FPDot performs both multiplies
3002      * and the add with a single rounding operation.  Emulate this
3003      * by performing the first multiply in round-to-odd, then doing
3004      * the second multiply as fused multiply-add, and rounding to
3005      * float32 all in one step.
3006      */
3007     t64 = float64_mul(e1r, e2r, fpst_odd);
3008     t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
3009 
3010     /* This conversion is exact, because we've already rounded. */
3011     t32 = float64_to_float32(t64, fpst);
3012 
3013     /* The final accumulation step is not fused. */
3014     return float32_add(sum, t32, fpst);
3015 }
3016 
3017 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
3018                         CPUARMState *env, uint32_t desc)
3019 {
3020     intptr_t i, opr_sz = simd_oprsz(desc);
3021     float32 *d = vd, *a = va;
3022     uint32_t *n = vn, *m = vm;
3023     float_status fpst, fpst_odd;
3024 
3025     if (is_ebf(env, &fpst, &fpst_odd)) {
3026         for (i = 0; i < opr_sz / 4; ++i) {
3027             d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
3028         }
3029     } else {
3030         for (i = 0; i < opr_sz / 4; ++i) {
3031             d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
3032         }
3033     }
3034     clear_tail(d, opr_sz, simd_maxsz(desc));
3035 }
3036 
3037 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
3038                             void *va, CPUARMState *env, uint32_t desc)
3039 {
3040     intptr_t i, j, opr_sz = simd_oprsz(desc);
3041     intptr_t index = simd_data(desc);
3042     intptr_t elements = opr_sz / 4;
3043     intptr_t eltspersegment = MIN(16 / 4, elements);
3044     float32 *d = vd, *a = va;
3045     uint32_t *n = vn, *m = vm;
3046     float_status fpst, fpst_odd;
3047 
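    /*
     * The indexed element of m is taken from within each 128-bit segment
     * and re-used for every element of that segment.
     */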
3048     if (is_ebf(env, &fpst, &fpst_odd)) {
3049         for (i = 0; i < elements; i += eltspersegment) {
3050             uint32_t m_idx = m[i + H4(index)];
3051 
3052             for (j = i; j < i + eltspersegment; j++) {
3053                 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
3054             }
3055         }
3056     } else {
3057         for (i = 0; i < elements; i += eltspersegment) {
3058             uint32_t m_idx = m[i + H4(index)];
3059 
3060             for (j = i; j < i + eltspersegment; j++) {
3061                 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
3062             }
3063         }
3064     }
3065     clear_tail(d, opr_sz, simd_maxsz(desc));
3066 }
3067 
3068 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
3069                          CPUARMState *env, uint32_t desc)
3070 {
3071     intptr_t s, opr_sz = simd_oprsz(desc);
3072     float32 *d = vd, *a = va;
3073     uint32_t *n = vn, *m = vm;
3074     float_status fpst, fpst_odd;
3075 
3076     if (is_ebf(env, &fpst, &fpst_odd)) {
3077         for (s = 0; s < opr_sz / 4; s += 4) {
3078             float32 sum00, sum01, sum10, sum11;
3079 
3080             /*
3081              * Process the entire segment at once, writing back the
3082              * results only after we've consumed all of the inputs.
3083              *
3084              * Key to indices by column:
3085              *               i   j               i   k             j   k
3086              */
3087             sum00 = a[s + H4(0 + 0)];
3088             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3089             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3090 
3091             sum01 = a[s + H4(0 + 1)];
3092             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3093             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3094 
3095             sum10 = a[s + H4(2 + 0)];
3096             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3097             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3098 
3099             sum11 = a[s + H4(2 + 1)];
3100             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3101             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3102 
3103             d[s + H4(0 + 0)] = sum00;
3104             d[s + H4(0 + 1)] = sum01;
3105             d[s + H4(2 + 0)] = sum10;
3106             d[s + H4(2 + 1)] = sum11;
3107         }
3108     } else {
3109         for (s = 0; s < opr_sz / 4; s += 4) {
3110             float32 sum00, sum01, sum10, sum11;
3111 
3112             /*
3113              * Process the entire segment at once, writing back the
3114              * results only after we've consumed all of the inputs.
3115              *
3116              * Key to indices by column:
3117              *               i   j           i   k             j   k
3118              */
3119             sum00 = a[s + H4(0 + 0)];
3120             sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
3121             sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
3122 
3123             sum01 = a[s + H4(0 + 1)];
3124             sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
3125             sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
3126 
3127             sum10 = a[s + H4(2 + 0)];
3128             sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
3129             sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
3130 
3131             sum11 = a[s + H4(2 + 1)];
3132             sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
3133             sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
3134 
3135             d[s + H4(0 + 0)] = sum00;
3136             d[s + H4(0 + 1)] = sum01;
3137             d[s + H4(2 + 0)] = sum10;
3138             d[s + H4(2 + 1)] = sum11;
3139         }
3140     }
3141     clear_tail(d, opr_sz, simd_maxsz(desc));
3142 }
3143 
3144 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
3145                          float_status *stat, uint32_t desc)
3146 {
3147     intptr_t i, opr_sz = simd_oprsz(desc);
3148     intptr_t sel = simd_data(desc);
3149     float32 *d = vd, *a = va;
3150     bfloat16 *n = vn, *m = vm;
3151 
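    /*
     * 'sel' chooses the even (bottom) or odd (top) BFloat16 element of
     * each 32-bit pair, matching the BFMLALB and BFMLALT encodings.
     */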
3152     for (i = 0; i < opr_sz / 4; ++i) {
3153         float32 nn = n[H2(i * 2 + sel)] << 16;
3154         float32 mm = m[H2(i * 2 + sel)] << 16;
3155         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
3156     }
3157     clear_tail(d, opr_sz, simd_maxsz(desc));
3158 }
3159 
3160 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
3161                              void *va, float_status *stat, uint32_t desc)
3162 {
3163     intptr_t i, j, opr_sz = simd_oprsz(desc);
3164     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3165     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3166     intptr_t elements = opr_sz / 4;
3167     intptr_t eltspersegment = MIN(16 / 4, elements);
3168     float32 *d = vd, *a = va;
3169     bfloat16 *n = vn, *m = vm;
3170 
3171     for (i = 0; i < elements; i += eltspersegment) {
3172         float32 m_idx = m[H2(2 * i + index)] << 16;
3173 
3174         for (j = i; j < i + eltspersegment; j++) {
3175             float32 n_j = n[H2(2 * j + sel)] << 16;
3176             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
3177         }
3178     }
3179     clear_tail(d, opr_sz, simd_maxsz(desc));
3180 }
3181 
3182 #define DO_CLAMP(NAME, TYPE) \
3183 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
3184 {                                                                       \
3185     intptr_t i, opr_sz = simd_oprsz(desc);                              \
3186     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
3187         TYPE aa = *(TYPE *)(a + i);                                     \
3188         TYPE nn = *(TYPE *)(n + i);                                     \
3189         TYPE mm = *(TYPE *)(m + i);                                     \
3190         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
3191         *(TYPE *)(d + i) = dd;                                          \
3192     }                                                                   \
3193     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
3194 }
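
/*
 * i.e. each result element is the accumulator value a clamped to the
 * inclusive range [n, m].  For example, gvec_sclamp_b with an n element
 * of -10 and an m element of 20 maps a = 42 to 20, a = -50 to -10, and
 * a = 5 to 5.
 */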
3195 
3196 DO_CLAMP(gvec_sclamp_b, int8_t)
3197 DO_CLAMP(gvec_sclamp_h, int16_t)
3198 DO_CLAMP(gvec_sclamp_s, int32_t)
3199 DO_CLAMP(gvec_sclamp_d, int64_t)
3200 
3201 DO_CLAMP(gvec_uclamp_b, uint8_t)
3202 DO_CLAMP(gvec_uclamp_h, uint16_t)
3203 DO_CLAMP(gvec_uclamp_s, uint32_t)
3204 DO_CLAMP(gvec_uclamp_d, uint64_t)
3205 
3206 /* Bit count in each 8-bit word. */
3207 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
3208 {
3209     intptr_t i, opr_sz = simd_oprsz(desc);
3210     uint8_t *d = vd, *n = vn;
3211 
3212     for (i = 0; i < opr_sz; ++i) {
3213         d[i] = ctpop8(n[i]);
3214     }
3215     clear_tail(d, opr_sz, simd_maxsz(desc));
3216 }
3217 
3218 /* Reverse bits in each 8-bit word. */
3219 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
3220 {
3221     intptr_t i, opr_sz = simd_oprsz(desc);
3222     uint64_t *d = vd, *n = vn;
3223 
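    /*
     * bswap64() reverses the byte order and revbit64() then reverses all
     * 64 bits; the two byte-order reversals cancel, leaving the bits
     * reversed within each individual byte.
     */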
3224     for (i = 0; i < opr_sz / 8; ++i) {
3225         d[i] = revbit64(bswap64(n[i]));
3226     }
3227     clear_tail(d, opr_sz, simd_maxsz(desc));
3228 }
3229 
3230 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
3231 {
3232     intptr_t i, opr_sz = simd_oprsz(desc);
3233     uint32_t *d = vd, *n = vn;
3234 
3235     for (i = 0; i < opr_sz / 4; ++i) {
3236         d[i] = helper_recpe_u32(n[i]);
3237     }
3238     clear_tail(d, opr_sz, simd_maxsz(desc));
3239 }
3240 
3241 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
3242 {
3243     intptr_t i, opr_sz = simd_oprsz(desc);
3244     uint32_t *d = vd, *n = vn;
3245 
3246     for (i = 0; i < opr_sz / 4; ++i) {
3247         d[i] = helper_rsqrte_u32(n[i]);
3248     }
3249     clear_tail(d, opr_sz, simd_maxsz(desc));
3250 }
3251