1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28 
29 /*
30  * Data for expanding active predicate bits to bytes, for byte elements.
31  *
32  *  for (i = 0; i < 256; ++i) {
33  *      unsigned long m = 0;
34  *      for (j = 0; j < 8; j++) {
35  *          if ((i >> j) & 1) {
36  *              m |= 0xfful << (j << 3);
37  *          }
38  *      }
39  *      printf("0x%016lx,\n", m);
40  *  }
41  */
42 const uint64_t expand_pred_b_data[256] = {
43     0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44     0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45     0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46     0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47     0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48     0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49     0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50     0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51     0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52     0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53     0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54     0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55     0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56     0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57     0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58     0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59     0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60     0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61     0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62     0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63     0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64     0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65     0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66     0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67     0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68     0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69     0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70     0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71     0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72     0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73     0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74     0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75     0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76     0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77     0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78     0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79     0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80     0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81     0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82     0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83     0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84     0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85     0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86     0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87     0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88     0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89     0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90     0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91     0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92     0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93     0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94     0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95     0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96     0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97     0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98     0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99     0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100     0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101     0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102     0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103     0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104     0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105     0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106     0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107     0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108     0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109     0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110     0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111     0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112     0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113     0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114     0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115     0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116     0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117     0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118     0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119     0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120     0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121     0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122     0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123     0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124     0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125     0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126     0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127     0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128     0xffffffffffffffff,
129 };
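/*
 * Illustrative example (not part of the original file): a predicate
 * byte with bits 0, 1 and 4 set marks byte elements 0, 1 and 4 active,
 * so the table maps it to a mask with exactly those byte lanes all-ones:
 *
 *     expand_pred_b_data[0x13] == 0x000000ff0000ffff
 *
 * ANDing a 64-bit vector chunk with this value keeps only the active
 * elements.
 */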
130 
131 /*
132  * Similarly for half-word elements.
133  *  for (i = 0; i < 256; ++i) {
134  *      unsigned long m = 0;
135  *      if (i & 0xaa) {
136  *          continue;
137  *      }
138  *      for (j = 0; j < 8; j += 2) {
139  *          if ((i >> j) & 1) {
140  *              m |= 0xfffful << (j << 3);
141  *          }
142  *      }
143  *      printf("[0x%x] = 0x%016lx,\n", i, m);
144  *  }
145  */
146 const uint64_t expand_pred_h_data[0x55 + 1] = {
147     [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148     [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149     [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150     [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151     [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152     [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153     [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154     [0x55] = 0xffffffffffffffff,
155 };
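/*
 * Added commentary (not part of the original file): for half-word
 * elements only every other predicate bit is significant -- bit 2*j of
 * the byte governs half-word j -- so the generator above skips indices
 * with any odd-position bit set and the table needs only 0x55 + 1
 * entries; it is only ever indexed with those bits already masked off.
 * For example, bits 0 and 4 select half-words 0 and 2:
 *
 *     expand_pred_h_data[0x11] == 0x0000ffff0000ffff
 */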
156 
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159                      bool neg, bool round)
160 {
161     /*
162      * With e1 = src1, e2 = src2 and a3 = src3, simplify:
163      * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164      * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
165      */
166     int32_t ret = (int32_t)src1 * src2;
167     if (neg) {
168         ret = -ret;
169     }
170     ret += ((int32_t)src3 << 7) + (round << 6);
171     ret >>= 7;
172 
173     if (ret != (int8_t)ret) {
174         ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175     }
176     return ret;
177 }
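/*
 * Worked example (illustrative, not part of the original file), for
 * do_sqrdmlah_b(0x40, 0x40, 0, false, true), i.e. SQRDMULH of 64 * 64:
 *
 *     full width:  (2 * 64 * 64 + (1 << 7)) >> 8 = 8320 >> 8 = 32
 *     simplified:  (64 * 64 + 0 + (1 << 6)) >> 7 = 4160 >> 7 = 32
 *
 * With src1 = src2 = -128 the shifted result is 128, which does not fit
 * in int8_t and therefore saturates to INT8_MAX.  Unlike the 16- and
 * 32-bit variants below there is no *sat argument: the SVE2 users of
 * this routine do not record cumulative saturation.
 */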
178 
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180                              void *va, uint32_t desc)
181 {
182     intptr_t i, opr_sz = simd_oprsz(desc);
183     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 
185     for (i = 0; i < opr_sz; ++i) {
186         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187     }
188 }
189 
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191                              void *va, uint32_t desc)
192 {
193     intptr_t i, opr_sz = simd_oprsz(desc);
194     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 
196     for (i = 0; i < opr_sz; ++i) {
197         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198     }
199 }
200 
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203     intptr_t i, opr_sz = simd_oprsz(desc);
204     int8_t *d = vd, *n = vn, *m = vm;
205 
206     for (i = 0; i < opr_sz; ++i) {
207         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208     }
209 }
210 
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213     intptr_t i, opr_sz = simd_oprsz(desc);
214     int8_t *d = vd, *n = vn, *m = vm;
215 
216     for (i = 0; i < opr_sz; ++i) {
217         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218     }
219 }
220 
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223                       bool neg, bool round, uint32_t *sat)
224 {
225     /* Simplify similarly to do_sqrdmlah_b above.  */
226     int32_t ret = (int32_t)src1 * src2;
227     if (neg) {
228         ret = -ret;
229     }
230     ret += ((int32_t)src3 << 15) + (round << 14);
231     ret >>= 15;
232 
233     if (ret != (int16_t)ret) {
234         *sat = 1;
235         ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236     }
237     return ret;
238 }
239 
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241                                   uint32_t src2, uint32_t src3)
242 {
243     uint32_t *sat = &env->vfp.qc[0];
244     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246                                 false, true, sat);
247     return deposit32(e1, 16, 16, e2);
248 }
249 
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251                               void *vq, uint32_t desc)
252 {
253     uintptr_t opr_sz = simd_oprsz(desc);
254     int16_t *d = vd;
255     int16_t *n = vn;
256     int16_t *m = vm;
257     uintptr_t i;
258 
259     for (i = 0; i < opr_sz / 2; ++i) {
260         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261     }
262     clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264 
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266                                   uint32_t src2, uint32_t src3)
267 {
268     uint32_t *sat = &env->vfp.qc[0];
269     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271                                 true, true, sat);
272     return deposit32(e1, 16, 16, e2);
273 }
274 
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276                               void *vq, uint32_t desc)
277 {
278     uintptr_t opr_sz = simd_oprsz(desc);
279     int16_t *d = vd;
280     int16_t *n = vn;
281     int16_t *m = vm;
282     uintptr_t i;
283 
284     for (i = 0; i < opr_sz / 2; ++i) {
285         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286     }
287     clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289 
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291                             void *vq, uint32_t desc)
292 {
293     intptr_t i, opr_sz = simd_oprsz(desc);
294     int16_t *d = vd, *n = vn, *m = vm;
295 
296     for (i = 0; i < opr_sz / 2; ++i) {
297         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298     }
299     clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301 
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303                              void *vq, uint32_t desc)
304 {
305     intptr_t i, opr_sz = simd_oprsz(desc);
306     int16_t *d = vd, *n = vn, *m = vm;
307 
308     for (i = 0; i < opr_sz / 2; ++i) {
309         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310     }
311     clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313 
314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315                                 void *vq, uint32_t desc)
316 {
317     intptr_t i, j, opr_sz = simd_oprsz(desc);
318     int idx = simd_data(desc);
319     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320     intptr_t elements = opr_sz / 2;
321     intptr_t eltspersegment = MIN(16 / 2, elements);
322 
323     for (i = 0; i < elements; i += 16 / 2) {
324         int16_t mm = m[i];
325         for (j = 0; j < eltspersegment; ++j) {
326             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
327         }
328     }
329     clear_tail(d, opr_sz, simd_maxsz(desc));
330 }
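/*
 * Added commentary (not part of the original file): in the *_idx_*
 * helpers the multiplier is a single "by element" operand.  m points at
 * element `idx' of vm, and each 16-byte segment of the output reuses
 * the value m[i] from the matching segment.  For these AdvSIMD helpers
 * opr_sz is 8 or 16 bytes, so there is only one segment and the MIN()
 * merely clamps eltspersegment to 4 half-words for 64-bit vectors.
 * E.g. with opr_sz == 16 and idx == 3, every lane of the helper above
 * computes
 *
 *     d[j] = do_sqrdmlah_h(n[j], <half-word 3 of vm>, 0, false, false, vq);
 */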
331 
332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
333                                  void *vq, uint32_t desc)
334 {
335     intptr_t i, j, opr_sz = simd_oprsz(desc);
336     int idx = simd_data(desc);
337     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338     intptr_t elements = opr_sz / 2;
339     intptr_t eltspersegment = MIN(16 / 2, elements);
340 
341     for (i = 0; i < elements; i += 16 / 2) {
342         int16_t mm = m[i];
343         for (j = 0; j < eltspersegment; ++j) {
344             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
345         }
346     }
347     clear_tail(d, opr_sz, simd_maxsz(desc));
348 }
349 
350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
351                                  void *vq, uint32_t desc)
352 {
353     intptr_t i, j, opr_sz = simd_oprsz(desc);
354     int idx = simd_data(desc);
355     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
356     intptr_t elements = opr_sz / 2;
357     intptr_t eltspersegment = MIN(16 / 2, elements);
358 
359     for (i = 0; i < elements; i += 16 / 2) {
360         int16_t mm = m[i];
361         for (j = 0; j < eltspersegment; ++j) {
362             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
363         }
364     }
365     clear_tail(d, opr_sz, simd_maxsz(desc));
366 }
367 
368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
369                                  void *vq, uint32_t desc)
370 {
371     intptr_t i, j, opr_sz = simd_oprsz(desc);
372     int idx = simd_data(desc);
373     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
374     intptr_t elements = opr_sz / 2;
375     intptr_t eltspersegment = MIN(16 / 2, elements);
376 
377     for (i = 0; i < elements; i += 16 / 2) {
378         int16_t mm = m[i];
379         for (j = 0; j < eltspersegment; ++j) {
380             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
381         }
382     }
383     clear_tail(d, opr_sz, simd_maxsz(desc));
384 }
385 
386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
387                              void *va, uint32_t desc)
388 {
389     intptr_t i, opr_sz = simd_oprsz(desc);
390     int16_t *d = vd, *n = vn, *m = vm, *a = va;
391     uint32_t discard;
392 
393     for (i = 0; i < opr_sz / 2; ++i) {
394         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
395     }
396 }
397 
398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
399                              void *va, uint32_t desc)
400 {
401     intptr_t i, opr_sz = simd_oprsz(desc);
402     int16_t *d = vd, *n = vn, *m = vm, *a = va;
403     uint32_t discard;
404 
405     for (i = 0; i < opr_sz / 2; ++i) {
406         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
407     }
408 }
409 
410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
411 {
412     intptr_t i, opr_sz = simd_oprsz(desc);
413     int16_t *d = vd, *n = vn, *m = vm;
414     uint32_t discard;
415 
416     for (i = 0; i < opr_sz / 2; ++i) {
417         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
418     }
419 }
420 
421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
422 {
423     intptr_t i, opr_sz = simd_oprsz(desc);
424     int16_t *d = vd, *n = vn, *m = vm;
425     uint32_t discard;
426 
427     for (i = 0; i < opr_sz / 2; ++i) {
428         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
429     }
430 }
431 
432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
433 {
434     intptr_t i, j, opr_sz = simd_oprsz(desc);
435     int idx = simd_data(desc);
436     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
437     uint32_t discard;
438 
439     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
440         int16_t mm = m[i];
441         for (j = 0; j < 16 / 2; ++j) {
442             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
443         }
444     }
445 }
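/*
 * Added commentary (not part of the original file): unlike the Neon
 * *_idx_* helpers earlier, the SVE2 indexed forms need no
 * eltspersegment clamp -- the SVE vector length is always a multiple of
 * 16 bytes, so each outer iteration covers a full 128-bit segment --
 * and no clear_tail(), since the operation spans the whole vector.
 */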
446 
447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
448 {
449     intptr_t i, j, opr_sz = simd_oprsz(desc);
450     int idx = simd_data(desc);
451     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
452     uint32_t discard;
453 
454     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
455         int16_t mm = m[i];
456         for (j = 0; j < 16 / 2; ++j) {
457             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
458         }
459     }
460 }
461 
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
464                       bool neg, bool round, uint32_t *sat)
465 {
466     /* Simplify similarly to do_sqrdmlah_b above.  */
467     int64_t ret = (int64_t)src1 * src2;
468     if (neg) {
469         ret = -ret;
470     }
471     ret += ((int64_t)src3 << 31) + (round << 30);
472     ret >>= 31;
473 
474     if (ret != (int32_t)ret) {
475         *sat = 1;
476         ret = (ret < 0 ? INT32_MIN : INT32_MAX);
477     }
478     return ret;
479 }
480 
481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
482                                   int32_t src2, int32_t src3)
483 {
484     uint32_t *sat = &env->vfp.qc[0];
485     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
486 }
487 
488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
489                               void *vq, uint32_t desc)
490 {
491     uintptr_t opr_sz = simd_oprsz(desc);
492     int32_t *d = vd;
493     int32_t *n = vn;
494     int32_t *m = vm;
495     uintptr_t i;
496 
497     for (i = 0; i < opr_sz / 4; ++i) {
498         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
499     }
500     clear_tail(d, opr_sz, simd_maxsz(desc));
501 }
502 
503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
504                                   int32_t src2, int32_t src3)
505 {
506     uint32_t *sat = &env->vfp.qc[0];
507     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
508 }
509 
510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
511                               void *vq, uint32_t desc)
512 {
513     uintptr_t opr_sz = simd_oprsz(desc);
514     int32_t *d = vd;
515     int32_t *n = vn;
516     int32_t *m = vm;
517     uintptr_t i;
518 
519     for (i = 0; i < opr_sz / 4; ++i) {
520         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
521     }
522     clear_tail(d, opr_sz, simd_maxsz(desc));
523 }
524 
525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
526                             void *vq, uint32_t desc)
527 {
528     intptr_t i, opr_sz = simd_oprsz(desc);
529     int32_t *d = vd, *n = vn, *m = vm;
530 
531     for (i = 0; i < opr_sz / 4; ++i) {
532         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
533     }
534     clear_tail(d, opr_sz, simd_maxsz(desc));
535 }
536 
537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
538                              void *vq, uint32_t desc)
539 {
540     intptr_t i, opr_sz = simd_oprsz(desc);
541     int32_t *d = vd, *n = vn, *m = vm;
542 
543     for (i = 0; i < opr_sz / 4; ++i) {
544         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
545     }
546     clear_tail(d, opr_sz, simd_maxsz(desc));
547 }
548 
549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
550                                 void *vq, uint32_t desc)
551 {
552     intptr_t i, j, opr_sz = simd_oprsz(desc);
553     int idx = simd_data(desc);
554     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
555     intptr_t elements = opr_sz / 4;
556     intptr_t eltspersegment = MIN(16 / 4, elements);
557 
558     for (i = 0; i < elements; i += 16 / 4) {
559         int32_t mm = m[i];
560         for (j = 0; j < eltspersegment; ++j) {
561             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
562         }
563     }
564     clear_tail(d, opr_sz, simd_maxsz(desc));
565 }
566 
567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
568                                  void *vq, uint32_t desc)
569 {
570     intptr_t i, j, opr_sz = simd_oprsz(desc);
571     int idx = simd_data(desc);
572     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
573     intptr_t elements = opr_sz / 4;
574     intptr_t eltspersegment = MIN(16 / 4, elements);
575 
576     for (i = 0; i < elements; i += 16 / 4) {
577         int32_t mm = m[i];
578         for (j = 0; j < eltspersegment; ++j) {
579             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
580         }
581     }
582     clear_tail(d, opr_sz, simd_maxsz(desc));
583 }
584 
585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
586                                  void *vq, uint32_t desc)
587 {
588     intptr_t i, j, opr_sz = simd_oprsz(desc);
589     int idx = simd_data(desc);
590     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
591     intptr_t elements = opr_sz / 4;
592     intptr_t eltspersegment = MIN(16 / 4, elements);
593 
594     for (i = 0; i < elements; i += 16 / 4) {
595         int32_t mm = m[i];
596         for (j = 0; j < eltspersegment; ++j) {
597             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
598         }
599     }
600     clear_tail(d, opr_sz, simd_maxsz(desc));
601 }
602 
603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
604                                  void *vq, uint32_t desc)
605 {
606     intptr_t i, j, opr_sz = simd_oprsz(desc);
607     int idx = simd_data(desc);
608     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
609     intptr_t elements = opr_sz / 4;
610     intptr_t eltspersegment = MIN(16 / 4, elements);
611 
612     for (i = 0; i < elements; i += 16 / 4) {
613         int32_t mm = m[i];
614         for (j = 0; j < eltspersegment; ++j) {
615             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
616         }
617     }
618     clear_tail(d, opr_sz, simd_maxsz(desc));
619 }
620 
621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
622                              void *va, uint32_t desc)
623 {
624     intptr_t i, opr_sz = simd_oprsz(desc);
625     int32_t *d = vd, *n = vn, *m = vm, *a = va;
626     uint32_t discard;
627 
628     for (i = 0; i < opr_sz / 4; ++i) {
629         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
630     }
631 }
632 
633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
634                              void *va, uint32_t desc)
635 {
636     intptr_t i, opr_sz = simd_oprsz(desc);
637     int32_t *d = vd, *n = vn, *m = vm, *a = va;
638     uint32_t discard;
639 
640     for (i = 0; i < opr_sz / 4; ++i) {
641         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
642     }
643 }
644 
645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
646 {
647     intptr_t i, opr_sz = simd_oprsz(desc);
648     int32_t *d = vd, *n = vn, *m = vm;
649     uint32_t discard;
650 
651     for (i = 0; i < opr_sz / 4; ++i) {
652         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
653     }
654 }
655 
656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
657 {
658     intptr_t i, opr_sz = simd_oprsz(desc);
659     int32_t *d = vd, *n = vn, *m = vm;
660     uint32_t discard;
661 
662     for (i = 0; i < opr_sz / 4; ++i) {
663         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
664     }
665 }
666 
667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
668 {
669     intptr_t i, j, opr_sz = simd_oprsz(desc);
670     int idx = simd_data(desc);
671     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
672     uint32_t discard;
673 
674     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
675         int32_t mm = m[i];
676         for (j = 0; j < 16 / 4; ++j) {
677             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
678         }
679     }
680 }
681 
682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
683 {
684     intptr_t i, j, opr_sz = simd_oprsz(desc);
685     int idx = simd_data(desc);
686     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
687     uint32_t discard;
688 
689     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
690         int32_t mm = m[i];
691         for (j = 0; j < 16 / 4; ++j) {
692             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
693         }
694     }
695 }
696 
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
698 static int64_t do_sat128_d(Int128 r)
699 {
700     int64_t ls = int128_getlo(r);
701     int64_t hs = int128_gethi(r);
702 
703     if (unlikely(hs != (ls >> 63))) {
704         return hs < 0 ? INT64_MIN : INT64_MAX;
705     }
706     return ls;
707 }
708 
709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710 {
711     uint64_t l, h;
712     Int128 r, t;
713 
714     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
715     muls64(&l, &h, m, n);
716     r = int128_make128(l, h);
717     if (neg) {
718         r = int128_neg(r);
719     }
720     if (a) {
721         t = int128_exts64(a);
722         t = int128_lshift(t, 63);
723         r = int128_add(r, t);
724     }
725     if (round) {
726         t = int128_exts64(1ll << 62);
727         r = int128_add(r, t);
728     }
729     r = int128_rshift(r, 63);
730 
731     return do_sat128_d(r);
732 }
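/*
 * Added commentary (not part of the original file): writing int128_t
 * for a notional signed 128-bit type, the function above computes
 *
 *     sat64((((int128_t)a << 63) +/- (int128_t)n * m
 *            + ((int128_t)round << 62)) >> 63)
 *
 * do_sat128_d() detects overflow of the final narrowing by checking
 * that the high half of the 128-bit value equals the sign extension of
 * the low half; if not, the result saturates toward the sign of the
 * high half.
 */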
733 
734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735                              void *va, uint32_t desc)
736 {
737     intptr_t i, opr_sz = simd_oprsz(desc);
738     int64_t *d = vd, *n = vn, *m = vm, *a = va;
739 
740     for (i = 0; i < opr_sz / 8; ++i) {
741         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742     }
743 }
744 
745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746                              void *va, uint32_t desc)
747 {
748     intptr_t i, opr_sz = simd_oprsz(desc);
749     int64_t *d = vd, *n = vn, *m = vm, *a = va;
750 
751     for (i = 0; i < opr_sz / 8; ++i) {
752         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753     }
754 }
755 
756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757 {
758     intptr_t i, opr_sz = simd_oprsz(desc);
759     int64_t *d = vd, *n = vn, *m = vm;
760 
761     for (i = 0; i < opr_sz / 8; ++i) {
762         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763     }
764 }
765 
766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767 {
768     intptr_t i, opr_sz = simd_oprsz(desc);
769     int64_t *d = vd, *n = vn, *m = vm;
770 
771     for (i = 0; i < opr_sz / 8; ++i) {
772         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773     }
774 }
775 
776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777 {
778     intptr_t i, j, opr_sz = simd_oprsz(desc);
779     int idx = simd_data(desc);
780     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781 
782     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783         int64_t mm = m[i];
784         for (j = 0; j < 16 / 8; ++j) {
785             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786         }
787     }
788 }
789 
790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791 {
792     intptr_t i, j, opr_sz = simd_oprsz(desc);
793     int idx = simd_data(desc);
794     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795 
796     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797         int64_t mm = m[i];
798         for (j = 0; j < 16 / 8; ++j) {
799             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800         }
801     }
802 }
803 
804 /* Integer 8- and 16-bit dot-products.
805  *
806  * Note that for the loops herein, host endianness does not matter
807  * with respect to the ordering of data within the quad-width lanes.
808  * All elements are treated equally, no matter where they are.
809  */
810 
811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
813 {                                                                         \
814     intptr_t i, opr_sz = simd_oprsz(desc);                                \
815     TYPED *d = vd, *a = va;                                               \
816     TYPEN *n = vn;                                                        \
817     TYPEM *m = vm;                                                        \
818     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
819         d[i] = (a[i] +                                                    \
820                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
821                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
822                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
823                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
824     }                                                                     \
825     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
826 }
827 
828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
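/*
 * Illustrative example (not part of the original file):
 * DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) above defines a helper
 * computing, for each 32-bit lane i,
 *
 *     d[i] = a[i] + n[4i]*m[4i]     + n[4i+1]*m[4i+1]
 *                 + n[4i+2]*m[4i+2] + n[4i+3]*m[4i+3]
 *
 * with the byte products sign-extended to 32 bits (SDOT).  The mixed
 * variant gvec_usdot_b takes n unsigned and m signed (USDOT).
 */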
833 
834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
836 {                                                                         \
837     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
838     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
839     /*                                                                    \
840      * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
841      * first iteration might not be a full 16 byte segment. But           \
842      * for vector lengths beyond that this must be SVE and we know        \
843      * opr_sz is a multiple of 16, so we need not clamp segend            \
844      * to opr_sz_n when we advance it at the end of the loop.             \
845      */                                                                   \
846     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
847     intptr_t index = simd_data(desc);                                     \
848     TYPED *d = vd, *a = va;                                               \
849     TYPEN *n = vn;                                                        \
850     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
851     do {                                                                  \
852         TYPED m0 = m_indexed[i * 4 + 0];                                  \
853         TYPED m1 = m_indexed[i * 4 + 1];                                  \
854         TYPED m2 = m_indexed[i * 4 + 2];                                  \
855         TYPED m3 = m_indexed[i * 4 + 3];                                  \
856         do {                                                              \
857             d[i] = (a[i] +                                                \
858                     n[i * 4 + 0] * m0 +                                   \
859                     n[i * 4 + 1] * m1 +                                   \
860                     n[i * 4 + 2] * m2 +                                   \
861                     n[i * 4 + 3] * m3);                                   \
862         } while (++i < segend);                                           \
863         segend = i + (16 / sizeof(TYPED));                                \
864     } while (i < opr_sz_n);                                               \
865     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
866 }
867 
868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
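/*
 * Added commentary (not part of the original file): the indexed forms
 * take the group of four TYPEM elements at position `index' within each
 * 16-byte segment of vm and reuse it for every TYPED lane of that
 * segment.  E.g. for gvec_sdot_idx_b with index == 2, each 32-bit lane
 * of a segment accumulates the dot product of its four n bytes with
 * bytes 8..11 of the same segment of vm.
 */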
874 
875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
876                          float_status *fpst, uint32_t desc)
877 {
878     uintptr_t opr_sz = simd_oprsz(desc);
879     float16 *d = vd;
880     float16 *n = vn;
881     float16 *m = vm;
882     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
883     bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
884     uintptr_t i;
885 
886     for (i = 0; i < opr_sz / 2; i += 2) {
887         float16 e0 = n[H2(i)];
888         float16 e1 = m[H2(i + 1)];
889         float16 e2 = n[H2(i + 1)];
890         float16 e3 = m[H2(i)];
891 
892         if (rot) {
893             e3 = float16_maybe_ah_chs(e3, fpcr_ah);
894         } else {
895             e1 = float16_maybe_ah_chs(e1, fpcr_ah);
896         }
897 
898         d[H2(i)] = float16_add(e0, e1, fpst);
899         d[H2(i + 1)] = float16_add(e2, e3, fpst);
900     }
901     clear_tail(d, opr_sz, simd_maxsz(desc));
902 }
903 
904 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
905                          float_status *fpst, uint32_t desc)
906 {
907     uintptr_t opr_sz = simd_oprsz(desc);
908     float32 *d = vd;
909     float32 *n = vn;
910     float32 *m = vm;
911     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
912     bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
913     uintptr_t i;
914 
915     for (i = 0; i < opr_sz / 4; i += 2) {
916         float32 e0 = n[H4(i)];
917         float32 e1 = m[H4(i + 1)];
918         float32 e2 = n[H4(i + 1)];
919         float32 e3 = m[H4(i)];
920 
921         if (rot) {
922             e3 = float32_maybe_ah_chs(e3, fpcr_ah);
923         } else {
924             e1 = float32_maybe_ah_chs(e1, fpcr_ah);
925         }
926 
927         d[H4(i)] = float32_add(e0, e1, fpst);
928         d[H4(i + 1)] = float32_add(e2, e3, fpst);
929     }
930     clear_tail(d, opr_sz, simd_maxsz(desc));
931 }
932 
933 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
934                          float_status *fpst, uint32_t desc)
935 {
936     uintptr_t opr_sz = simd_oprsz(desc);
937     float64 *d = vd;
938     float64 *n = vn;
939     float64 *m = vm;
940     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
941     bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
942     uintptr_t i;
943 
944     for (i = 0; i < opr_sz / 8; i += 2) {
945         float64 e0 = n[i];
946         float64 e1 = m[i + 1];
947         float64 e2 = n[i + 1];
948         float64 e3 = m[i];
949 
950         if (rot) {
951             e3 = float64_maybe_ah_chs(e3, fpcr_ah);
952         } else {
953             e1 = float64_maybe_ah_chs(e1, fpcr_ah);
954         }
955 
956         d[i] = float64_add(e0, e1, fpst);
957         d[i + 1] = float64_add(e2, e3, fpst);
958     }
959     clear_tail(d, opr_sz, simd_maxsz(desc));
960 }
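/*
 * Added commentary (not part of the original file): the three helpers
 * above implement FCADD.  Treating the lanes as complex pairs and
 * writing re/im for (n[2k], n[2k+1]) and (m[2k], m[2k+1]):
 *
 *     rot clear:  d.re = n.re - m.im,  d.im = n.im + m.re
 *     rot set:    d.re = n.re + m.im,  d.im = n.im - m.re
 *
 * i.e. the two architectural rotations, #90 and #270.  With FPCR.AH set
 * the float*_maybe_ah_chs() calls leave the sign of a NaN operand
 * unchanged rather than negating it.
 */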
961 
962 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
963                          float_status *fpst, uint32_t desc)
964 {
965     uintptr_t opr_sz = simd_oprsz(desc);
966     float16 *d = vd, *n = vn, *m = vm, *a = va;
967     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
968     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
969     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
970     uint32_t negf_real = flip ^ negf_imag;
971     float16 negx_imag, negx_real;
972     uintptr_t i;
973 
974     /* With AH=0, use negx; with AH=1 use negf. */
975     negx_real = (negf_real & ~fpcr_ah) << 15;
976     negx_imag = (negf_imag & ~fpcr_ah) << 15;
977     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
978     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
979 
980     for (i = 0; i < opr_sz / 2; i += 2) {
981         float16 e2 = n[H2(i + flip)];
982         float16 e1 = m[H2(i + flip)] ^ negx_real;
983         float16 e4 = e2;
984         float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag;
985 
986         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], negf_real, fpst);
987         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], negf_imag, fpst);
988     }
989     clear_tail(d, opr_sz, simd_maxsz(desc));
990 }
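/*
 * Added commentary (not part of the original file): with re/im denoting
 * the lanes of each complex pair, the helper above computes one of
 *
 *     flip=0 negf_imag=0 (rot 0):
 *         d.re = a.re + n.re * m.re,   d.im = a.im + n.re * m.im
 *     flip=1 negf_imag=0 (rot 90):
 *         d.re = a.re - n.im * m.im,   d.im = a.im + n.im * m.re
 *     flip=0 negf_imag=1 (rot 180):
 *         d.re = a.re - n.re * m.re,   d.im = a.im - n.re * m.im
 *     flip=1 negf_imag=1 (rot 270):
 *         d.re = a.re + n.im * m.im,   d.im = a.im - n.im * m.re
 *
 * which are the four FCMLA rotations.  With FPCR.AH clear the negation
 * is applied by XORing the sign bit into the m operand (negx_*); with
 * AH set it is requested via float_muladd_negate_product (negf_*), so
 * the sign of a NaN operand is not flipped.  The remaining FCMLA
 * helpers below follow the same pattern.
 */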
991 
992 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
993                              float_status *fpst, uint32_t desc)
994 {
995     uintptr_t opr_sz = simd_oprsz(desc);
996     float16 *d = vd, *n = vn, *m = vm, *a = va;
997     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
998     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
999     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1000     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
1001     uint32_t negf_real = flip ^ negf_imag;
1002     intptr_t elements = opr_sz / sizeof(float16);
1003     intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
1004     float16 negx_imag, negx_real;
1005     intptr_t i, j;
1006 
1007     /* With AH=0, use negx; with AH=1 use negf. */
1008     negx_real = (negf_real & ~fpcr_ah) << 15;
1009     negx_imag = (negf_imag & ~fpcr_ah) << 15;
1010     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1011     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1012 
1013     for (i = 0; i < elements; i += eltspersegment) {
1014         float16 mr = m[H2(i + 2 * index + 0)];
1015         float16 mi = m[H2(i + 2 * index + 1)];
1016         float16 e1 = negx_real ^ (flip ? mi : mr);
1017         float16 e3 = negx_imag ^ (flip ? mr : mi);
1018 
1019         for (j = i; j < i + eltspersegment; j += 2) {
1020             float16 e2 = n[H2(j + flip)];
1021             float16 e4 = e2;
1022 
1023             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], negf_real, fpst);
1024             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], negf_imag, fpst);
1025         }
1026     }
1027     clear_tail(d, opr_sz, simd_maxsz(desc));
1028 }
1029 
1030 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
1031                          float_status *fpst, uint32_t desc)
1032 {
1033     uintptr_t opr_sz = simd_oprsz(desc);
1034     float32 *d = vd, *n = vn, *m = vm, *a = va;
1035     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1036     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
1037     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1038     uint32_t negf_real = flip ^ negf_imag;
1039     float32 negx_imag, negx_real;
1040     uintptr_t i;
1041 
1042     /* With AH=0, use negx; with AH=1 use negf. */
1043     negx_real = (negf_real & ~fpcr_ah) << 31;
1044     negx_imag = (negf_imag & ~fpcr_ah) << 31;
1045     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1046     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1047 
1048     for (i = 0; i < opr_sz / 4; i += 2) {
1049         float32 e2 = n[H4(i + flip)];
1050         float32 e1 = m[H4(i + flip)] ^ negx_real;
1051         float32 e4 = e2;
1052         float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag;
1053 
1054         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], negf_real, fpst);
1055         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], negf_imag, fpst);
1056     }
1057     clear_tail(d, opr_sz, simd_maxsz(desc));
1058 }
1059 
1060 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
1061                              float_status *fpst, uint32_t desc)
1062 {
1063     uintptr_t opr_sz = simd_oprsz(desc);
1064     float32 *d = vd, *n = vn, *m = vm, *a = va;
1065     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1066     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1067     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1068     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
1069     uint32_t negf_real = flip ^ negf_imag;
1070     intptr_t elements = opr_sz / sizeof(float32);
1071     intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
1072     float32 negx_imag, negx_real;
1073     intptr_t i, j;
1074 
1075     /* With AH=0, use negx; with AH=1 use negf. */
1076     negx_real = (negf_real & ~fpcr_ah) << 31;
1077     negx_imag = (negf_imag & ~fpcr_ah) << 31;
1078     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1079     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1080 
1081     for (i = 0; i < elements; i += eltspersegment) {
1082         float32 mr = m[H4(i + 2 * index + 0)];
1083         float32 mi = m[H4(i + 2 * index + 1)];
1084         float32 e1 = negx_real ^ (flip ? mi : mr);
1085         float32 e3 = negx_imag ^ (flip ? mr : mi);
1086 
1087         for (j = i; j < i + eltspersegment; j += 2) {
1088             float32 e2 = n[H4(j + flip)];
1089             float32 e4 = e2;
1090 
1091             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], negf_real, fpst);
1092             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], negf_imag, fpst);
1093         }
1094     }
1095     clear_tail(d, opr_sz, simd_maxsz(desc));
1096 }
1097 
1098 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
1099                          float_status *fpst, uint32_t desc)
1100 {
1101     uintptr_t opr_sz = simd_oprsz(desc);
1102     float64 *d = vd, *n = vn, *m = vm, *a = va;
1103     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1104     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
1105     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1106     uint32_t negf_real = flip ^ negf_imag;
1107     float64 negx_real, negx_imag;
1108     uintptr_t i;
1109 
1110     /* With AH=0, use negx; with AH=1 use negf. */
1111     negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
1112     negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
1113     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1114     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1115 
1116     for (i = 0; i < opr_sz / 8; i += 2) {
1117         float64 e2 = n[i + flip];
1118         float64 e1 = m[i + flip] ^ negx_real;
1119         float64 e4 = e2;
1120         float64 e3 = m[i + 1 - flip] ^ negx_imag;
1121 
1122         d[i] = float64_muladd(e2, e1, a[i], negf_real, fpst);
1123         d[i + 1] = float64_muladd(e4, e3, a[i + 1], negf_imag, fpst);
1124     }
1125     clear_tail(d, opr_sz, simd_maxsz(desc));
1126 }
1127 
1128 /*
1129  * Floating point comparisons producing an integer result (all 1s or all 0s).
1130  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1131  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1132  */
1133 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1134 {
1135     return -float16_eq_quiet(op1, op2, stat);
1136 }
1137 
1138 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1139 {
1140     return -float32_eq_quiet(op1, op2, stat);
1141 }
1142 
1143 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1144 {
1145     return -float64_eq_quiet(op1, op2, stat);
1146 }
1147 
1148 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1149 {
1150     return -float16_le(op2, op1, stat);
1151 }
1152 
1153 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1154 {
1155     return -float32_le(op2, op1, stat);
1156 }
1157 
1158 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1159 {
1160     return -float64_le(op2, op1, stat);
1161 }
1162 
1163 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1164 {
1165     return -float16_lt(op2, op1, stat);
1166 }
1167 
1168 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1169 {
1170     return -float32_lt(op2, op1, stat);
1171 }
1172 
1173 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1174 {
1175     return -float64_lt(op2, op1, stat);
1176 }
1177 
1178 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1179 {
1180     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1181 }
1182 
1183 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1184 {
1185     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1186 }
1187 
1188 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1189 {
1190     return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1191 }
1192 
1193 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1194 {
1195     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1196 }
1197 
1198 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1199 {
1200     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1201 }
1202 
1203 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1204 {
1205     return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1206 }
1207 
1208 static int16_t vfp_tosszh(float16 x, float_status *fpst)
1209 {
1210     if (float16_is_any_nan(x)) {
1211         float_raise(float_flag_invalid, fpst);
1212         return 0;
1213     }
1214     return float16_to_int16_round_to_zero(x, fpst);
1215 }
1216 
1217 static uint16_t vfp_touszh(float16 x, float_status *fpst)
1218 {
1219     if (float16_is_any_nan(x)) {
1220         float_raise(float_flag_invalid, fpst);
1221         return 0;
1222     }
1223     return float16_to_uint16_round_to_zero(x, fpst);
1224 }
1225 
1226 #define DO_2OP(NAME, FUNC, TYPE) \
1227 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc)  \
1228 {                                                                 \
1229     intptr_t i, oprsz = simd_oprsz(desc);                         \
1230     TYPE *d = vd, *n = vn;                                        \
1231     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1232         d[i] = FUNC(n[i], stat);                                  \
1233     }                                                             \
1234     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1235 }
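/*
 * Illustrative expansion (not part of the original file): for example,
 * DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) below defines
 *
 *     void HELPER(gvec_frecpe_h)(void *vd, void *vn,
 *                                float_status *stat, uint32_t desc)
 *     {
 *         intptr_t i, oprsz = simd_oprsz(desc);
 *         float16 *d = vd, *n = vn;
 *         for (i = 0; i < oprsz / sizeof(float16); i++) {
 *             d[i] = helper_recpe_f16(n[i], stat);
 *         }
 *         clear_tail(d, oprsz, simd_maxsz(desc));
 *     }
 */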
1236 
1237 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1238 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1239 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1240 
1241 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1242 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1243 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1244 
1245 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1246 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1247 
1248 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1249 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1250 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1251 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1252 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1253 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1254 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1255 DO_2OP(gvec_touszh, vfp_touszh, float16)
1256 
1257 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1258     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1259     {                                                           \
1260         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1261     }
1262 
1263 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1264     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1265     {                                                           \
1266         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1267     }
1268 
1269 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1270     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1271     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1272     WRAP_CMP0_##DIRN(FN, CMPOP, float64)                \
1273     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1274     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)   \
1275     DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)
1276 
1277 DO_2OP_CMP0(cgt, cgt, FWD)
1278 DO_2OP_CMP0(cge, cge, FWD)
1279 DO_2OP_CMP0(ceq, ceq, FWD)
1280 DO_2OP_CMP0(clt, cgt, REV)
1281 DO_2OP_CMP0(cle, cge, REV)
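/*
 * Illustrative expansion (not part of the original file): for the REV
 * direction, DO_2OP_CMP0(clt, cgt, REV) above produces, at float32,
 *
 *     static float32 float32_clt0(float32 op, float_status *stat)
 *     {
 *         return float32_cgt(float32_zero, op, stat);
 *     }
 *
 * plus the DO_2OP helper gvec_fclt0_s, so "less than zero" reuses the
 * cgt comparison with its operands reversed.
 */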
1282 
1283 #undef DO_2OP
1284 #undef DO_2OP_CMP0
1285 
1286 /* Floating-point trigonometric starting value.
1287  * See the ARM ARM pseudocode function FPTrigSMul.
1288  */
1289 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1290 {
1291     float16 result = float16_mul(op1, op1, stat);
1292     if (!float16_is_any_nan(result)) {
1293         result = float16_set_sign(result, op2 & 1);
1294     }
1295     return result;
1296 }
1297 
1298 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1299 {
1300     float32 result = float32_mul(op1, op1, stat);
1301     if (!float32_is_any_nan(result)) {
1302         result = float32_set_sign(result, op2 & 1);
1303     }
1304     return result;
1305 }
1306 
1307 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1308 {
1309     float64 result = float64_mul(op1, op1, stat);
1310     if (!float64_is_any_nan(result)) {
1311         result = float64_set_sign(result, op2 & 1);
1312     }
1313     return result;
1314 }
1315 
1316 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1317 {
1318     return float16_abs(float16_sub(op1, op2, stat));
1319 }
1320 
1321 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1322 {
1323     return float32_abs(float32_sub(op1, op2, stat));
1324 }
1325 
1326 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1327 {
1328     return float64_abs(float64_sub(op1, op2, stat));
1329 }
1330 
1331 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
1332 static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
1333 {
1334     float16 r = float16_sub(op1, op2, stat);
1335     return float16_is_any_nan(r) ? r : float16_abs(r);
1336 }
1337 
1338 static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
1339 {
1340     float32 r = float32_sub(op1, op2, stat);
1341     return float32_is_any_nan(r) ? r : float32_abs(r);
1342 }
1343 
1344 static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
1345 {
1346     float64 r = float64_sub(op1, op2, stat);
1347     return float64_is_any_nan(r) ? r : float64_abs(r);
1348 }
1349 
1350 /*
1351  * Reciprocal step. These are the AArch32 version which uses a
1352  * Reciprocal step. These are the AArch32 versions, which use a
1353  * non-fused multiply-and-subtract.
1354 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1355 {
1356     op1 = float16_squash_input_denormal(op1, stat);
1357     op2 = float16_squash_input_denormal(op2, stat);
1358 
1359     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1360         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1361         return float16_two;
1362     }
1363     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1364 }
1365 
1366 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1367 {
1368     op1 = float32_squash_input_denormal(op1, stat);
1369     op2 = float32_squash_input_denormal(op2, stat);
1370 
1371     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1372         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1373         return float32_two;
1374     }
1375     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1376 }
1377 
1378 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1379 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1380 {
1381     op1 = float16_squash_input_denormal(op1, stat);
1382     op2 = float16_squash_input_denormal(op2, stat);
1383 
1384     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1385         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1386         return float16_one_point_five;
1387     }
1388     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1389     return float16_div(op1, float16_two, stat);
1390 }
1391 
1392 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1393 {
1394     op1 = float32_squash_input_denormal(op1, stat);
1395     op2 = float32_squash_input_denormal(op2, stat);
1396 
1397     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1398         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1399         return float32_one_point_five;
1400     }
1401     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1402     return float32_div(op1, float32_two, stat);
1403 }
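/*
 * Added commentary (not part of the original file): VRECPS and VRSQRTS
 * are the refinement steps of the Newton-Raphson iterations for 1/d and
 * 1/sqrt(d):
 *
 *     x' = x * (2 - d * x)             recps:  2 - op1 * op2
 *     x' = x * (3 - d * x * x) / 2     rsqrts: (3 - op1 * op2) / 2
 *
 * The infinity * zero special cases above follow the architected
 * behaviour, which keeps a step from returning NaN when one operand is
 * zero and the other infinite.
 */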
1404 
1405 #define DO_3OP(NAME, FUNC, TYPE) \
1406 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1407                   float_status *stat, uint32_t desc)                       \
1408 {                                                                          \
1409     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1410     TYPE *d = vd, *n = vn, *m = vm;                                        \
1411     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1412         d[i] = FUNC(n[i], m[i], stat);                                     \
1413     }                                                                      \
1414     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1415 }
1416 
1417 DO_3OP(gvec_fadd_h, float16_add, float16)
1418 DO_3OP(gvec_fadd_s, float32_add, float32)
1419 DO_3OP(gvec_fadd_d, float64_add, float64)
1420 
1421 DO_3OP(gvec_fsub_h, float16_sub, float16)
1422 DO_3OP(gvec_fsub_s, float32_sub, float32)
1423 DO_3OP(gvec_fsub_d, float64_sub, float64)
1424 
1425 DO_3OP(gvec_fmul_h, float16_mul, float16)
1426 DO_3OP(gvec_fmul_s, float32_mul, float32)
1427 DO_3OP(gvec_fmul_d, float64_mul, float64)
1428 
1429 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1430 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1431 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1432 
1433 DO_3OP(gvec_fabd_h, float16_abd, float16)
1434 DO_3OP(gvec_fabd_s, float32_abd, float32)
1435 DO_3OP(gvec_fabd_d, float64_abd, float64)
1436 
1437 DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
1438 DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
1439 DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)
1440 
1441 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1442 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1443 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1444 
1445 DO_3OP(gvec_fcge_h, float16_cge, float16)
1446 DO_3OP(gvec_fcge_s, float32_cge, float32)
1447 DO_3OP(gvec_fcge_d, float64_cge, float64)
1448 
1449 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1450 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1451 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1452 
1453 DO_3OP(gvec_facge_h, float16_acge, float16)
1454 DO_3OP(gvec_facge_s, float32_acge, float32)
1455 DO_3OP(gvec_facge_d, float64_acge, float64)
1456 
1457 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1458 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1459 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1460 
1461 DO_3OP(gvec_fmax_h, float16_max, float16)
1462 DO_3OP(gvec_fmax_s, float32_max, float32)
1463 DO_3OP(gvec_fmax_d, float64_max, float64)
1464 
1465 DO_3OP(gvec_fmin_h, float16_min, float16)
1466 DO_3OP(gvec_fmin_s, float32_min, float32)
1467 DO_3OP(gvec_fmin_d, float64_min, float64)
1468 
1469 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1470 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1471 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1472 
1473 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1474 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1475 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1476 
1477 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1478 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1479 
1480 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1481 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1482 
1483 #ifdef TARGET_AARCH64
1484 DO_3OP(gvec_fdiv_h, float16_div, float16)
1485 DO_3OP(gvec_fdiv_s, float32_div, float32)
1486 DO_3OP(gvec_fdiv_d, float64_div, float64)
1487 
1488 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1489 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1490 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1491 
1492 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1493 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1494 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1495 
1496 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1497 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1498 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1499 
1500 DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
1501 DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
1502 DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)
1503 
1504 DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
1505 DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
1506 DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)
1507 
1508 DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
1509 DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
1510 DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)
1511 
1512 DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
1513 DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
1514 DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)
1515 
1516 #endif
1517 #undef DO_3OP
1518 
1519 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1520 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1521                                  float_status *stat)
1522 {
1523     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1524 }
1525 
1526 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1527                                  float_status *stat)
1528 {
1529     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1530 }
1531 
1532 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1533                                  float_status *stat)
1534 {
1535     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1536 }
1537 
1538 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1539                                  float_status *stat)
1540 {
1541     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1542 }
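/*
 * The _nf helpers above round twice, once after the multiply and once
 * after the add/sub, matching the non-fused Neon VMLA/VMLS definition:
 *
 *   non-fused: round(dest +/- round(op1 * op2))
 *   fused:     round(dest +/- op1 * op2)        via TYPE_muladd()
 */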
1543 
1544 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1545 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1546                                 float_status *stat)
1547 {
1548     return float16_muladd(op1, op2, dest, 0, stat);
1549 }
1550 
1551 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1552                                  float_status *stat)
1553 {
1554     return float32_muladd(op1, op2, dest, 0, stat);
1555 }
1556 
1557 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1558                                  float_status *stat)
1559 {
1560     return float64_muladd(op1, op2, dest, 0, stat);
1561 }
1562 
1563 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1564                                  float_status *stat)
1565 {
1566     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1567 }
1568 
1569 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1570                                  float_status *stat)
1571 {
1572     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1573 }
1574 
1575 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1576                                  float_status *stat)
1577 {
1578     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1579 }
1580 
1581 static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2,
1582                                  float_status *stat)
1583 {
1584     return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1585 }
1586 
1587 static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2,
1588                                  float_status *stat)
1589 {
1590     return float32_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1591 }
1592 
1593 static float64 float64_ah_mulsub_f(float64 dest, float64 op1, float64 op2,
1594                                  float_status *stat)
1595 {
1596     return float64_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1597 }
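/*
 * Note the two fused multiply-subtract flavours above: the plain
 * _mulsub_f helpers negate op1 itself with TYPE_chs(), while the
 * _ah_mulsub_f helpers (for FPCR.AH == 1) leave the operands alone
 * and pass float_muladd_negate_product instead, so that a NaN input
 * is propagated without its sign bit flipped.
 */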
1598 
1599 #define DO_MULADD(NAME, FUNC, TYPE)                                        \
1600 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1601                   float_status *stat, uint32_t desc)                       \
1602 {                                                                          \
1603     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1604     TYPE *d = vd, *n = vn, *m = vm;                                        \
1605     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1606         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1607     }                                                                      \
1608     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1609 }
1610 
1611 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1612 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1613 
1614 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1615 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1616 
1617 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1618 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1619 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1620 
1621 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1622 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1623 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1624 
1625 DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16)
1626 DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32)
1627 DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64)
1628 
1629 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1630  * For AdvSIMD, there is of course only one such vector segment.
1631  */
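/*
 * E.g. for 32-bit elements and a 256-bit SVE vector (oprsz == 32),
 * segment == 4 below: elements 0..3 all use m[idx] from the first
 * 128-bit segment, while elements 4..7 use m[4 + idx] from the second.
 */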
1632 
1633 #define DO_MUL_IDX(NAME, TYPE, H) \
1634 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1635 {                                                                          \
1636     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1637     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1638     intptr_t idx = simd_data(desc);                                        \
1639     TYPE *d = vd, *n = vn, *m = vm;                                        \
1640     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1641         TYPE mm = m[H(i + idx)];                                           \
1642         for (j = 0; j < segment; j++) {                                    \
1643             d[i + j] = n[i + j] * mm;                                      \
1644         }                                                                  \
1645     }                                                                      \
1646     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1647 }
1648 
1649 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1650 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1651 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1652 
1653 #undef DO_MUL_IDX
1654 
1655 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1656 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1657 {                                                                          \
1658     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1659     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1660     intptr_t idx = simd_data(desc);                                        \
1661     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1662     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1663         TYPE mm = m[H(i + idx)];                                           \
1664         for (j = 0; j < segment; j++) {                                    \
1665             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1666         }                                                                  \
1667     }                                                                      \
1668     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1669 }
1670 
1671 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1672 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1673 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1674 
1675 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1676 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1677 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1678 
1679 #undef DO_MLA_IDX
1680 
1681 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1682 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1683                   float_status *stat, uint32_t desc)                       \
1684 {                                                                          \
1685     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1686     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1687     intptr_t idx = simd_data(desc);                                        \
1688     TYPE *d = vd, *n = vn, *m = vm;                                        \
1689     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1690         TYPE mm = m[H(i + idx)];                                           \
1691         for (j = 0; j < segment; j++) {                                    \
1692             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1693         }                                                                  \
1694     }                                                                      \
1695     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1696 }
1697 
1698 #define nop(N, M, S) (M)
1699 
1700 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1701 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1702 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1703 
1704 #ifdef TARGET_AARCH64
1705 
1706 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1707 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1708 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1709 
1710 #endif
1711 
1712 #undef nop
1713 
1714 /*
1715  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1716  * the fused ops below, these accumulate both from and into Vd.
1717  */
1718 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1719 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1720 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1721 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1722 
1723 #undef DO_FMUL_IDX
1724 
1725 #define DO_FMLA_IDX(NAME, TYPE, H, NEGX, NEGF)                             \
1726 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1727                   float_status *stat, uint32_t desc)                       \
1728 {                                                                          \
1729     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1730     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1731     intptr_t idx = simd_data(desc);                                        \
1732     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1733     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1734         TYPE mm = m[H(i + idx)];                                           \
1735         for (j = 0; j < segment; j++) {                                    \
1736             d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm,                  \
1737                                      a[i + j], NEGF, stat);                \
1738         }                                                                  \
1739     }                                                                      \
1740     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1741 }
1742 
1743 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0)
1744 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0)
1745 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0)
1746 
1747 DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0)
1748 DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0)
1749 DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0)
1750 
1751 DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product)
1752 DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product)
1753 DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product)
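/*
 * The fmls variants negate the multiplicand in one of two ways:
 * NEGX == INT16_MIN/INT32_MIN/INT64_MIN XORs the sign bit directly
 * into n[i], while the _ah_ variants instead pass NEGF ==
 * float_muladd_negate_product so that, with FPCR.AH set, NaN
 * operands are propagated without a flipped sign bit.
 */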
1754 
1755 #undef DO_FMLA_IDX
1756 
1757 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1758 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1759 {                                                                          \
1760     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1761     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1762     bool q = false;                                                        \
1763     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1764         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1765         if (dd < MIN) {                                                    \
1766             dd = MIN;                                                      \
1767             q = true;                                                      \
1768         } else if (dd > MAX) {                                             \
1769             dd = MAX;                                                      \
1770             q = true;                                                      \
1771         }                                                                  \
1772         d[i] = dd;                                                         \
1773     }                                                                      \
1774     if (q) {                                                               \
1775         uint32_t *qc = vq;                                                 \
1776         qc[0] = 1;                                                         \
1777     }                                                                      \
1778     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1779 }
1780 
1781 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1782 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1783 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1784 
1785 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1786 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1787 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1788 
1789 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1790 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1791 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1792 
1793 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1794 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1795 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1796 
1797 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1798 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1799 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1800 
1801 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1802 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1803 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
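/*
 * Worked example for gvec_uqadd_b: n[i] == 200 and m[i] == 100 give
 * dd == 300 in the wider int type, which exceeds UINT8_MAX, so the
 * lane saturates to 255 and the sticky QC flag (qc[0]) is set.
 */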
1804 
1805 #undef DO_SAT
1806 
1807 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1808                           void *vm, uint32_t desc)
1809 {
1810     intptr_t i, oprsz = simd_oprsz(desc);
1811     uint64_t *d = vd, *n = vn, *m = vm;
1812     bool q = false;
1813 
1814     for (i = 0; i < oprsz / 8; i++) {
1815         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1816         if (dd < nn) {
1817             dd = UINT64_MAX;
1818             q = true;
1819         }
1820         d[i] = dd;
1821     }
1822     if (q) {
1823         uint32_t *qc = vq;
1824         qc[0] = 1;
1825     }
1826     clear_tail(d, oprsz, simd_maxsz(desc));
1827 }
1828 
1829 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1830                           void *vm, uint32_t desc)
1831 {
1832     intptr_t i, oprsz = simd_oprsz(desc);
1833     uint64_t *d = vd, *n = vn, *m = vm;
1834     bool q = false;
1835 
1836     for (i = 0; i < oprsz / 8; i++) {
1837         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1838         if (nn < mm) {
1839             dd = 0;
1840             q = true;
1841         }
1842         d[i] = dd;
1843     }
1844     if (q) {
1845         uint32_t *qc = vq;
1846         qc[0] = 1;
1847     }
1848     clear_tail(d, oprsz, simd_maxsz(desc));
1849 }
1850 
1851 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1852                           void *vm, uint32_t desc)
1853 {
1854     intptr_t i, oprsz = simd_oprsz(desc);
1855     int64_t *d = vd, *n = vn, *m = vm;
1856     bool q = false;
1857 
1858     for (i = 0; i < oprsz / 8; i++) {
1859         int64_t nn = n[i], mm = m[i], dd = nn + mm;
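        /*
         * Signed overflow occurred iff nn and mm have the same sign
         * but dd does not; saturate toward the sign of the operands,
         * i.e. INT64_MAX when they are positive, INT64_MIN when negative.
         */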
1860         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1861             dd = (nn >> 63) ^ ~INT64_MIN;
1862             q = true;
1863         }
1864         d[i] = dd;
1865     }
1866     if (q) {
1867         uint32_t *qc = vq;
1868         qc[0] = 1;
1869     }
1870     clear_tail(d, oprsz, simd_maxsz(desc));
1871 }
1872 
1873 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1874                           void *vm, uint32_t desc)
1875 {
1876     intptr_t i, oprsz = simd_oprsz(desc);
1877     int64_t *d = vd, *n = vn, *m = vm;
1878     bool q = false;
1879 
1880     for (i = 0; i < oprsz / 8; i++) {
1881         int64_t nn = n[i], mm = m[i], dd = nn - mm;
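        /*
         * For subtraction the overflow condition is the opposite:
         * nn and mm have different signs and dd does not match the
         * sign of nn; saturate in the direction of nn as above.
         */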
1882         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1883             dd = (nn >> 63) ^ ~INT64_MIN;
1884             q = true;
1885         }
1886         d[i] = dd;
1887     }
1888     if (q) {
1889         uint32_t *qc = vq;
1890         qc[0] = 1;
1891     }
1892     clear_tail(d, oprsz, simd_maxsz(desc));
1893 }
1894 
1895 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1896                            void *vm, uint32_t desc)
1897 {
1898     intptr_t i, oprsz = simd_oprsz(desc);
1899     uint64_t *d = vd, *n = vn, *m = vm;
1900     bool q = false;
1901 
1902     for (i = 0; i < oprsz / 8; i++) {
1903         uint64_t nn = n[i];
1904         int64_t mm = m[i];
1905         uint64_t dd = nn + mm;
1906 
1907         if (mm < 0) {
1908             if (nn < (uint64_t)-mm) {
1909                 dd = 0;
1910                 q = true;
1911             }
1912         } else {
1913             if (dd < nn) {
1914                 dd = UINT64_MAX;
1915                 q = true;
1916             }
1917         }
1918         d[i] = dd;
1919     }
1920     if (q) {
1921         uint32_t *qc = vq;
1922         qc[0] = 1;
1923     }
1924     clear_tail(d, oprsz, simd_maxsz(desc));
1925 }
1926 
1927 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1928                            void *vm, uint32_t desc)
1929 {
1930     intptr_t i, oprsz = simd_oprsz(desc);
1931     uint64_t *d = vd, *n = vn, *m = vm;
1932     bool q = false;
1933 
1934     for (i = 0; i < oprsz / 8; i++) {
1935         int64_t nn = n[i];
1936         uint64_t mm = m[i];
1937         int64_t dd = nn + mm;
1938 
1939         if (mm > (uint64_t)(INT64_MAX - nn)) {
1940             dd = INT64_MAX;
1941             q = true;
1942         }
1943         d[i] = dd;
1944     }
1945     if (q) {
1946         uint32_t *qc = vq;
1947         qc[0] = 1;
1948     }
1949     clear_tail(d, oprsz, simd_maxsz(desc));
1950 }
1951 
1952 #define DO_SRA(NAME, TYPE)                              \
1953 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1954 {                                                       \
1955     intptr_t i, oprsz = simd_oprsz(desc);               \
1956     int shift = simd_data(desc);                        \
1957     TYPE *d = vd, *n = vn;                              \
1958     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1959         d[i] += n[i] >> shift;                          \
1960     }                                                   \
1961     clear_tail(d, oprsz, simd_maxsz(desc));             \
1962 }
1963 
1964 DO_SRA(gvec_ssra_b, int8_t)
1965 DO_SRA(gvec_ssra_h, int16_t)
1966 DO_SRA(gvec_ssra_s, int32_t)
1967 DO_SRA(gvec_ssra_d, int64_t)
1968 
1969 DO_SRA(gvec_usra_b, uint8_t)
1970 DO_SRA(gvec_usra_h, uint16_t)
1971 DO_SRA(gvec_usra_s, uint32_t)
1972 DO_SRA(gvec_usra_d, uint64_t)
1973 
1974 #undef DO_SRA
1975 
1976 #define DO_RSHR(NAME, TYPE)                             \
1977 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1978 {                                                       \
1979     intptr_t i, oprsz = simd_oprsz(desc);               \
1980     int shift = simd_data(desc);                        \
1981     TYPE *d = vd, *n = vn;                              \
1982     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1983         TYPE tmp = n[i] >> (shift - 1);                 \
1984         d[i] = (tmp >> 1) + (tmp & 1);                  \
1985     }                                                   \
1986     clear_tail(d, oprsz, simd_maxsz(desc));             \
1987 }
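/*
 * The two-step shift rounds to nearest without overflowing an
 * (n + (1 << (shift - 1))) intermediate: tmp keeps the rounding bit
 * in bit 0, so (tmp >> 1) + (tmp & 1) == (n + (1 << (shift - 1))) >> shift.
 * E.g. n == 7, shift == 2: tmp == 3, result == 1 + 1 == 2.
 */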
1988 
1989 DO_RSHR(gvec_srshr_b, int8_t)
1990 DO_RSHR(gvec_srshr_h, int16_t)
1991 DO_RSHR(gvec_srshr_s, int32_t)
1992 DO_RSHR(gvec_srshr_d, int64_t)
1993 
1994 DO_RSHR(gvec_urshr_b, uint8_t)
1995 DO_RSHR(gvec_urshr_h, uint16_t)
1996 DO_RSHR(gvec_urshr_s, uint32_t)
1997 DO_RSHR(gvec_urshr_d, uint64_t)
1998 
1999 #undef DO_RSHR
2000 
2001 #define DO_RSRA(NAME, TYPE)                             \
2002 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2003 {                                                       \
2004     intptr_t i, oprsz = simd_oprsz(desc);               \
2005     int shift = simd_data(desc);                        \
2006     TYPE *d = vd, *n = vn;                              \
2007     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2008         TYPE tmp = n[i] >> (shift - 1);                 \
2009         d[i] += (tmp >> 1) + (tmp & 1);                 \
2010     }                                                   \
2011     clear_tail(d, oprsz, simd_maxsz(desc));             \
2012 }
2013 
2014 DO_RSRA(gvec_srsra_b, int8_t)
2015 DO_RSRA(gvec_srsra_h, int16_t)
2016 DO_RSRA(gvec_srsra_s, int32_t)
2017 DO_RSRA(gvec_srsra_d, int64_t)
2018 
2019 DO_RSRA(gvec_ursra_b, uint8_t)
2020 DO_RSRA(gvec_ursra_h, uint16_t)
2021 DO_RSRA(gvec_ursra_s, uint32_t)
2022 DO_RSRA(gvec_ursra_d, uint64_t)
2023 
2024 #undef DO_RSRA
2025 
2026 #define DO_SRI(NAME, TYPE)                              \
2027 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2028 {                                                       \
2029     intptr_t i, oprsz = simd_oprsz(desc);               \
2030     int shift = simd_data(desc);                        \
2031     TYPE *d = vd, *n = vn;                              \
2032     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2033         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
2034     }                                                   \
2035     clear_tail(d, oprsz, simd_maxsz(desc));             \
2036 }
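/*
 * Shift-right and insert: the top 'shift' bits of each destination
 * element are preserved, e.g. for uint8_t with shift == 4 this is
 * d[i] = (d[i] & 0xf0) | (n[i] >> 4).  SLI below is the complement,
 * preserving the low 'shift' bits instead.
 */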
2037 
2038 DO_SRI(gvec_sri_b, uint8_t)
2039 DO_SRI(gvec_sri_h, uint16_t)
2040 DO_SRI(gvec_sri_s, uint32_t)
2041 DO_SRI(gvec_sri_d, uint64_t)
2042 
2043 #undef DO_SRI
2044 
2045 #define DO_SLI(NAME, TYPE)                              \
2046 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2047 {                                                       \
2048     intptr_t i, oprsz = simd_oprsz(desc);               \
2049     int shift = simd_data(desc);                        \
2050     TYPE *d = vd, *n = vn;                              \
2051     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2052         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
2053     }                                                   \
2054     clear_tail(d, oprsz, simd_maxsz(desc));             \
2055 }
2056 
2057 DO_SLI(gvec_sli_b, uint8_t)
2058 DO_SLI(gvec_sli_h, uint16_t)
2059 DO_SLI(gvec_sli_s, uint32_t)
2060 DO_SLI(gvec_sli_d, uint64_t)
2061 
2062 #undef DO_SLI
2063 
2064 /*
2065  * Convert float16 to float32, raising no exceptions and
2066  * preserving exceptional values, including SNaN.
2067  * This is effectively an unpack+repack operation.
2068  */
2069 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
2070 {
2071     const int f16_bias = 15;
2072     const int f32_bias = 127;
2073     uint32_t sign = extract32(f16, 15, 1);
2074     uint32_t exp = extract32(f16, 10, 5);
2075     uint32_t frac = extract32(f16, 0, 10);
2076 
2077     if (exp == 0x1f) {
2078         /* Inf or NaN */
2079         exp = 0xff;
2080     } else if (exp == 0) {
2081         /* Zero or denormal.  */
2082         if (frac != 0) {
2083             if (fz16) {
2084                 frac = 0;
2085             } else {
2086                 /*
2087                  * Denormal; these are all normal float32.
2088                  * Shift the fraction so that the msb is at bit 10,
2089                  * then remove bit 10 as the implicit bit of the
2090                  * normalized float32.  Note that we still go through
2091                  * the shift for normal numbers below, to put the
2092                  * float32 fraction at the right place.
2093                  */
2094                 int shift = clz32(frac) - 21;
2095                 frac = (frac << shift) & 0x3ff;
2096                 exp = f32_bias - f16_bias - shift + 1;
2097             }
2098         }
2099     } else {
2100         /* Normal number; adjust the bias.  */
2101         exp += f32_bias - f16_bias;
2102     }
2103     sign <<= 31;
2104     exp <<= 23;
2105     frac <<= 23 - 10;
2106 
2107     return sign | exp | frac;
2108 }
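/*
 * Example (fz16 false): the smallest float16 denormal 0x0001 has
 * frac == 1, so shift == clz32(1) - 21 == 10, the fraction becomes 0
 * once the implicit bit is masked off, and exp == 127 - 15 - 10 + 1
 * == 103, which is the float32 encoding of 2^-24 as expected.
 */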
2109 
2110 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2111 {
2112     /*
2113      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2114      * Load the 2nd qword iff is_q & is_2.
2115      * Shift to the 2nd dword iff !is_q & is_2.
2116      * For !is_q & !is_2, the upper bits of the result are garbage.
2117      */
2118     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2119 }
2120 
2121 /*
2122  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2123  * as there are not yet SVE versions that might use blocking.
2124  */
2125 
2126 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2127                      uint32_t desc, bool fz16)
2128 {
2129     intptr_t i, oprsz = simd_oprsz(desc);
2130     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2131     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2132     int is_q = oprsz == 16;
2133     uint64_t n_4, m_4;
2134 
2135     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2136     n_4 = load4_f16(vn, is_q, is_2);
2137     m_4 = load4_f16(vm, is_q, is_2);
2138 
2139     /* Negate all inputs for FMLSL at once.  */
2140     if (is_s) {
2141         n_4 ^= 0x8000800080008000ull;
2142     }
2143 
2144     for (i = 0; i < oprsz / 4; i++) {
2145         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2146         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2147         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2148     }
2149     clear_tail(d, oprsz, simd_maxsz(desc));
2150 }
2151 
2152 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2153                             CPUARMState *env, uint32_t desc)
2154 {
2155     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2156              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2157 }
2158 
2159 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2160                             CPUARMState *env, uint32_t desc)
2161 {
2162     do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, desc,
2163              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2164 }
2165 
2166 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2167                                CPUARMState *env, uint32_t desc)
2168 {
2169     intptr_t i, oprsz = simd_oprsz(desc);
2170     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2171     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2172     float_status *status = &env->vfp.fp_status_a64;
2173     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2174 
2175     for (i = 0; i < oprsz; i += sizeof(float32)) {
2176         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2177         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2178         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2179         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2180         float32 aa = *(float32 *)(va + H1_4(i));
2181 
2182         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2183     }
2184 }
2185 
2186 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2187                          uint32_t desc, bool fz16)
2188 {
2189     intptr_t i, oprsz = simd_oprsz(desc);
2190     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2191     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2192     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2193     int is_q = oprsz == 16;
2194     uint64_t n_4;
2195     float32 m_1;
2196 
2197     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2198     n_4 = load4_f16(vn, is_q, is_2);
2199 
2200     /* Negate all inputs for FMLSL at once.  */
2201     if (is_s) {
2202         n_4 ^= 0x8000800080008000ull;
2203     }
2204 
2205     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2206 
2207     for (i = 0; i < oprsz / 4; i++) {
2208         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2209         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2210     }
2211     clear_tail(d, oprsz, simd_maxsz(desc));
2212 }
2213 
2214 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2215                                 CPUARMState *env, uint32_t desc)
2216 {
2217     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2218                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2219 }
2220 
2221 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2222                                 CPUARMState *env, uint32_t desc)
2223 {
2224     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, desc,
2225                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2226 }
2227 
2228 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2229                                CPUARMState *env, uint32_t desc)
2230 {
2231     intptr_t i, j, oprsz = simd_oprsz(desc);
2232     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2233     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2234     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2235     float_status *status = &env->vfp.fp_status_a64;
2236     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2237 
2238     for (i = 0; i < oprsz; i += 16) {
2239         float16 mm_16 = *(float16 *)(vm + i + idx);
2240         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2241 
2242         for (j = 0; j < 16; j += sizeof(float32)) {
2243             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2244             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2245             float32 aa = *(float32 *)(va + H1_4(i + j));
2246 
2247             *(float32 *)(vd + H1_4(i + j)) =
2248                 float32_muladd(nn, mm, aa, 0, status);
2249         }
2250     }
2251 }
2252 
2253 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2254 {
2255     intptr_t i, opr_sz = simd_oprsz(desc);
2256     int8_t *d = vd, *n = vn, *m = vm;
2257 
2258     for (i = 0; i < opr_sz; ++i) {
2259         int8_t mm = m[i];
2260         int8_t nn = n[i];
2261         int8_t res = 0;
2262         if (mm >= 0) {
2263             if (mm < 8) {
2264                 res = nn << mm;
2265             }
2266         } else {
2267             res = nn >> (mm > -8 ? -mm : 7);
2268         }
2269         d[i] = res;
2270     }
2271     clear_tail(d, opr_sz, simd_maxsz(desc));
2272 }
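/*
 * SSHL treats each m element as a signed shift count: non-negative
 * counts shift left (counts >= 8 leave nothing, hence 0), negative
 * counts shift right arithmetically, with counts <= -8 clamped to 7
 * so that only the sign of n survives.
 */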
2273 
2274 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2275 {
2276     intptr_t i, opr_sz = simd_oprsz(desc);
2277     int16_t *d = vd, *n = vn, *m = vm;
2278 
2279     for (i = 0; i < opr_sz / 2; ++i) {
2280         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2281         int16_t nn = n[i];
2282         int16_t res = 0;
2283         if (mm >= 0) {
2284             if (mm < 16) {
2285                 res = nn << mm;
2286             }
2287         } else {
2288             res = nn >> (mm > -16 ? -mm : 15);
2289         }
2290         d[i] = res;
2291     }
2292     clear_tail(d, opr_sz, simd_maxsz(desc));
2293 }
2294 
2295 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2296 {
2297     intptr_t i, opr_sz = simd_oprsz(desc);
2298     uint8_t *d = vd, *n = vn, *m = vm;
2299 
2300     for (i = 0; i < opr_sz; ++i) {
2301         int8_t mm = m[i];
2302         uint8_t nn = n[i];
2303         uint8_t res = 0;
2304         if (mm >= 0) {
2305             if (mm < 8) {
2306                 res = nn << mm;
2307             }
2308         } else {
2309             if (mm > -8) {
2310                 res = nn >> -mm;
2311             }
2312         }
2313         d[i] = res;
2314     }
2315     clear_tail(d, opr_sz, simd_maxsz(desc));
2316 }
2317 
2318 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2319 {
2320     intptr_t i, opr_sz = simd_oprsz(desc);
2321     uint16_t *d = vd, *n = vn, *m = vm;
2322 
2323     for (i = 0; i < opr_sz / 2; ++i) {
2324         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2325         uint16_t nn = n[i];
2326         uint16_t res = 0;
2327         if (mm >= 0) {
2328             if (mm < 16) {
2329                 res = nn << mm;
2330             }
2331         } else {
2332             if (mm > -16) {
2333                 res = nn >> -mm;
2334             }
2335         }
2336         d[i] = res;
2337     }
2338     clear_tail(d, opr_sz, simd_maxsz(desc));
2339 }
2340 
2341 /*
2342  * 8x8->8 polynomial multiply.
2343  *
2344  * Polynomial multiplication is like integer multiplication except the
2345  * partial products are XORed, not added.
2346  *
2347  * TODO: expose this as a generic vector operation, as it is a common
2348  * crypto building block.
2349  */
2350 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2351 {
2352     intptr_t i, opr_sz = simd_oprsz(desc);
2353     uint64_t *d = vd, *n = vn, *m = vm;
2354 
2355     for (i = 0; i < opr_sz / 8; ++i) {
2356         d[i] = clmul_8x8_low(n[i], m[i]);
2357     }
2358     clear_tail(d, opr_sz, simd_maxsz(desc));
2359 }
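/*
 * E.g. 0b11 * 0b11 as polynomials over GF(2): the partial products
 * 0b11 and 0b110 are XORed rather than added, giving 0b101 (5)
 * instead of the integer product 9.
 */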
2360 
2361 /*
2362  * 64x64->128 polynomial multiply.
2363  * Because the lanes are not accessed in strict columns,
2364  * this probably cannot be turned into a generic helper.
2365  */
2366 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2367 {
2368     intptr_t i, opr_sz = simd_oprsz(desc);
2369     intptr_t hi = simd_data(desc);
2370     uint64_t *d = vd, *n = vn, *m = vm;
2371 
2372     for (i = 0; i < opr_sz / 8; i += 2) {
2373         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2374         d[i] = int128_getlo(r);
2375         d[i + 1] = int128_gethi(r);
2376     }
2377     clear_tail(d, opr_sz, simd_maxsz(desc));
2378 }
2379 
2380 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2381 {
2382     int hi = simd_data(desc);
2383     uint64_t *d = vd, *n = vn, *m = vm;
2384     uint64_t nn = n[hi], mm = m[hi];
2385 
2386     d[0] = clmul_8x4_packed(nn, mm);
2387     nn >>= 32;
2388     mm >>= 32;
2389     d[1] = clmul_8x4_packed(nn, mm);
2390 
2391     clear_tail(d, 16, simd_maxsz(desc));
2392 }
2393 
2394 #ifdef TARGET_AARCH64
2395 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2396 {
2397     int shift = simd_data(desc) * 8;
2398     intptr_t i, opr_sz = simd_oprsz(desc);
2399     uint64_t *d = vd, *n = vn, *m = vm;
2400 
2401     for (i = 0; i < opr_sz / 8; ++i) {
2402         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2403     }
2404 }
2405 
2406 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2407 {
2408     intptr_t sel = H4(simd_data(desc));
2409     intptr_t i, opr_sz = simd_oprsz(desc);
2410     uint32_t *n = vn, *m = vm;
2411     uint64_t *d = vd;
2412 
2413     for (i = 0; i < opr_sz / 8; ++i) {
2414         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2415     }
2416 }
2417 #endif
2418 
2419 #define DO_CMP0(NAME, TYPE, OP)                         \
2420 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2421 {                                                       \
2422     intptr_t i, opr_sz = simd_oprsz(desc);              \
2423     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2424         TYPE nn = *(TYPE *)(vn + i);                    \
2425         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2426     }                                                   \
2427     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2428 }
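/*
 * The negation turns the 0/1 comparison result into an all-zeroes or
 * all-ones mask of the element width, e.g. 0xffff for a true 16-bit
 * compare.
 */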
2429 
2430 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2431 DO_CMP0(gvec_clt0_b, int8_t, <)
2432 DO_CMP0(gvec_cle0_b, int8_t, <=)
2433 DO_CMP0(gvec_cgt0_b, int8_t, >)
2434 DO_CMP0(gvec_cge0_b, int8_t, >=)
2435 
2436 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2437 DO_CMP0(gvec_clt0_h, int16_t, <)
2438 DO_CMP0(gvec_cle0_h, int16_t, <=)
2439 DO_CMP0(gvec_cgt0_h, int16_t, >)
2440 DO_CMP0(gvec_cge0_h, int16_t, >=)
2441 
2442 #undef DO_CMP0
2443 
2444 #define DO_ABD(NAME, TYPE)                                      \
2445 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2446 {                                                               \
2447     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2448     TYPE *d = vd, *n = vn, *m = vm;                             \
2449                                                                 \
2450     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2451         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2452     }                                                           \
2453     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2454 }
2455 
2456 DO_ABD(gvec_sabd_b, int8_t)
2457 DO_ABD(gvec_sabd_h, int16_t)
2458 DO_ABD(gvec_sabd_s, int32_t)
2459 DO_ABD(gvec_sabd_d, int64_t)
2460 
2461 DO_ABD(gvec_uabd_b, uint8_t)
2462 DO_ABD(gvec_uabd_h, uint16_t)
2463 DO_ABD(gvec_uabd_s, uint32_t)
2464 DO_ABD(gvec_uabd_d, uint64_t)
2465 
2466 #undef DO_ABD
2467 
2468 #define DO_ABA(NAME, TYPE)                                      \
2469 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2470 {                                                               \
2471     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2472     TYPE *d = vd, *n = vn, *m = vm;                             \
2473                                                                 \
2474     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2475         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2476     }                                                           \
2477     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2478 }
2479 
2480 DO_ABA(gvec_saba_b, int8_t)
2481 DO_ABA(gvec_saba_h, int16_t)
2482 DO_ABA(gvec_saba_s, int32_t)
2483 DO_ABA(gvec_saba_d, int64_t)
2484 
2485 DO_ABA(gvec_uaba_b, uint8_t)
2486 DO_ABA(gvec_uaba_h, uint16_t)
2487 DO_ABA(gvec_uaba_s, uint32_t)
2488 DO_ABA(gvec_uaba_d, uint64_t)
2489 
2490 #undef DO_ABA
2491 
2492 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2493 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
2494                   float_status *stat, uint32_t desc)                       \
2495 {                                                                          \
2496     ARMVectorReg scratch;                                                  \
2497     intptr_t oprsz = simd_oprsz(desc);                                     \
2498     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2499     TYPE *d = vd, *n = vn, *m = vm;                                        \
2500     if (unlikely(d == m)) {                                                \
2501         m = memcpy(&scratch, m, oprsz);                                    \
2502     }                                                                      \
2503     for (intptr_t i = 0; i < half; ++i) {                                  \
2504         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2505     }                                                                      \
2506     for (intptr_t i = 0; i < half; ++i) {                                  \
2507         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2508     }                                                                      \
2509     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2510 }
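/*
 * Pairwise layout: the low half of the destination is built from
 * adjacent pairs of n, the high half from adjacent pairs of m, which
 * is why m must be copied to scratch when it aliases d.  Aliasing of
 * n and d is safe because d[i] is written only after n[2i] and
 * n[2i+1] have been read.
 */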
2511 
2512 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2513 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2514 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2515 
2516 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2517 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2518 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2519 
2520 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2521 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2522 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2523 
2524 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2525 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2526 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2527 
2528 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2529 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2530 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2531 
2532 #ifdef TARGET_AARCH64
2533 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
2534 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
2535 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, )
2536 
2537 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
2538 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
2539 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, )
2540 #endif
2541 
2542 #undef DO_3OP_PAIR
2543 
2544 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2545 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2546 {                                                               \
2547     ARMVectorReg scratch;                                       \
2548     intptr_t oprsz = simd_oprsz(desc);                          \
2549     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2550     TYPE *d = vd, *n = vn, *m = vm;                             \
2551     if (unlikely(d == m)) {                                     \
2552         m = memcpy(&scratch, m, oprsz);                         \
2553     }                                                           \
2554     for (intptr_t i = 0; i < half; ++i) {                       \
2555         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2556     }                                                           \
2557     for (intptr_t i = 0; i < half; ++i) {                       \
2558         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2559     }                                                           \
2560     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2561 }
2562 
2563 #define ADD(A, B) (A + B)
2564 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2565 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2566 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2567 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2568 #undef  ADD
2569 
2570 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2571 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2572 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2573 
2574 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2575 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2576 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2577 
2578 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2579 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2580 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2581 
2582 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2583 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2584 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2585 
2586 #undef DO_3OP_PAIR
2587 
2588 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2589     void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
2590     {                                                                   \
2591         intptr_t i, oprsz = simd_oprsz(desc);                           \
2592         int shift = simd_data(desc);                                    \
2593         TYPE *d = vd, *n = vn;                                          \
2594         float_status *fpst = stat;                                      \
2595         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2596             d[i] = FUNC(n[i], shift, fpst);                             \
2597         }                                                               \
2598         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2599     }
2600 
2601 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2602 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2603 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2604 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2605 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2606 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2607 
2608 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2609 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2610 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2611 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2612 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2613 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2614 
2615 #undef DO_VCVT_FIXED
2616 
2617 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2618     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2619     {                                                                   \
2620         intptr_t i, oprsz = simd_oprsz(desc);                           \
2621         uint32_t rmode = simd_data(desc);                               \
2622         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2623         TYPE *d = vd, *n = vn;                                          \
2624         set_float_rounding_mode(rmode, fpst);                           \
2625         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2626             d[i] = FUNC(n[i], 0, fpst);                                 \
2627         }                                                               \
2628         set_float_rounding_mode(prev_rmode, fpst);                      \
2629         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2630     }
2631 
2632 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
2633 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
2634 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2635 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2636 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2637 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2638 
2639 #undef DO_VCVT_RMODE
2640 
2641 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2642     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2643     {                                                                   \
2644         intptr_t i, oprsz = simd_oprsz(desc);                           \
2645         uint32_t rmode = simd_data(desc);                               \
2646         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2647         TYPE *d = vd, *n = vn;                                          \
2648         set_float_rounding_mode(rmode, fpst);                           \
2649         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2650             d[i] = FUNC(n[i], fpst);                                    \
2651         }                                                               \
2652         set_float_rounding_mode(prev_rmode, fpst);                      \
2653         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2654     }
2655 
2656 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2657 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2658 
2659 #undef DO_VRINT_RMODE
2660 
2661 #ifdef TARGET_AARCH64
2662 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
2663 {
2664     const uint8_t *indices = vm;
2665     size_t oprsz = simd_oprsz(desc);
2666     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2667     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2668     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2669     union {
2670         uint8_t b[16];
2671         uint64_t d[2];
2672     } result;
2673 
2674     /*
2675      * We must construct the final result in a temp, lest the output
2676      * overlap the input table.  For TBL, begin with zero; for TBX,
2677      * begin with the original register contents.  Note that we always
2678      * copy 16 bytes here to avoid an extra branch; clearing the high
2679      * bits of the register for oprsz == 8 is handled below.
2680      */
2681     if (is_tbx) {
2682         memcpy(&result, vd, 16);
2683     } else {
2684         memset(&result, 0, 16);
2685     }
2686 
2687     for (size_t i = 0; i < oprsz; ++i) {
2688         uint32_t index = indices[H1(i)];
2689 
2690         if (index < table_len) {
2691             /*
2692              * Convert index (a byte offset into the virtual table
2693              * which is a series of 128-bit vectors concatenated)
2694              * into the correct register element, bearing in mind
2695              * that the table can wrap around from V31 to V0.
2696              */
2697             const uint8_t *table = (const uint8_t *)
2698                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2699             result.b[H1(i)] = table[H1(index % 16)];
2700         }
2701     }
2702 
2703     memcpy(vd, &result, 16);
2704     clear_tail(vd, oprsz, simd_maxsz(desc));
2705 }
2706 #endif
2707 
2708 /*
2709  * NxN -> N highpart multiply
2710  *
2711  * TODO: expose this as a generic vector operation.
2712  */
2713 
2714 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2715 {
2716     intptr_t i, opr_sz = simd_oprsz(desc);
2717     int8_t *d = vd, *n = vn, *m = vm;
2718 
2719     for (i = 0; i < opr_sz; ++i) {
2720         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2721     }
2722     clear_tail(d, opr_sz, simd_maxsz(desc));
2723 }
2724 
2725 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2726 {
2727     intptr_t i, opr_sz = simd_oprsz(desc);
2728     int16_t *d = vd, *n = vn, *m = vm;
2729 
2730     for (i = 0; i < opr_sz / 2; ++i) {
2731         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2732     }
2733     clear_tail(d, opr_sz, simd_maxsz(desc));
2734 }
2735 
2736 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2737 {
2738     intptr_t i, opr_sz = simd_oprsz(desc);
2739     int32_t *d = vd, *n = vn, *m = vm;
2740 
2741     for (i = 0; i < opr_sz / 4; ++i) {
2742         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2743     }
2744     clear_tail(d, opr_sz, simd_maxsz(desc));
2745 }
2746 
2747 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2748 {
2749     intptr_t i, opr_sz = simd_oprsz(desc);
2750     uint64_t *d = vd, *n = vn, *m = vm;
2751     uint64_t discard;
2752 
2753     for (i = 0; i < opr_sz / 8; ++i) {
2754         muls64(&discard, &d[i], n[i], m[i]);
2755     }
2756     clear_tail(d, opr_sz, simd_maxsz(desc));
2757 }
2758 
2759 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2760 {
2761     intptr_t i, opr_sz = simd_oprsz(desc);
2762     uint8_t *d = vd, *n = vn, *m = vm;
2763 
2764     for (i = 0; i < opr_sz; ++i) {
2765         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2766     }
2767     clear_tail(d, opr_sz, simd_maxsz(desc));
2768 }
2769 
2770 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2771 {
2772     intptr_t i, opr_sz = simd_oprsz(desc);
2773     uint16_t *d = vd, *n = vn, *m = vm;
2774 
2775     for (i = 0; i < opr_sz / 2; ++i) {
2776         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2777     }
2778     clear_tail(d, opr_sz, simd_maxsz(desc));
2779 }
2780 
2781 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2782 {
2783     intptr_t i, opr_sz = simd_oprsz(desc);
2784     uint32_t *d = vd, *n = vn, *m = vm;
2785 
2786     for (i = 0; i < opr_sz / 4; ++i) {
2787         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2788     }
2789     clear_tail(d, opr_sz, simd_maxsz(desc));
2790 }
2791 
2792 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2793 {
2794     intptr_t i, opr_sz = simd_oprsz(desc);
2795     uint64_t *d = vd, *n = vn, *m = vm;
2796     uint64_t discard;
2797 
2798     for (i = 0; i < opr_sz / 8; ++i) {
2799         mulu64(&discard, &d[i], n[i], m[i]);
2800     }
2801     clear_tail(d, opr_sz, simd_maxsz(desc));
2802 }
2803 
2804 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2805 {
2806     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2807     int shr = simd_data(desc);
2808     uint64_t *d = vd, *n = vn, *m = vm;
2809 
2810     for (i = 0; i < opr_sz; ++i) {
2811         d[i] = ror64(n[i] ^ m[i], shr);
2812     }
2813     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2814 }
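
/*
 * Worked example of XAR above (illustrative only): each 64-bit lane is
 * XORed and then rotated right by the immediate, e.g.
 *
 *     n = 0x00000000000000ff, m = 0x0000000000000f0f, shr = 8
 *     n ^ m            = 0x0000000000000ff0
 *     ror64(n ^ m, 8)  = 0xf00000000000000f
 */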
2815 
2816 /*
2817  * Integer matrix-multiply accumulate
2818  */
2819 
2820 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2821 {
2822     int8_t *n = vn, *m = vm;
2823 
2824     for (intptr_t k = 0; k < 8; ++k) {
2825         sum += n[H1(k)] * m[H1(k)];
2826     }
2827     return sum;
2828 }
2829 
2830 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2831 {
2832     uint8_t *n = vn, *m = vm;
2833 
2834     for (intptr_t k = 0; k < 8; ++k) {
2835         sum += n[H1(k)] * m[H1(k)];
2836     }
2837     return sum;
2838 }
2839 
2840 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2841 {
2842     uint8_t *n = vn;
2843     int8_t *m = vm;
2844 
2845     for (intptr_t k = 0; k < 8; ++k) {
2846         sum += n[H1(k)] * m[H1(k)];
2847     }
2848     return sum;
2849 }
2850 
2851 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2852                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2853 {
2854     intptr_t seg, opr_sz = simd_oprsz(desc);
2855 
2856     for (seg = 0; seg < opr_sz; seg += 16) {
2857         uint32_t *d = vd + seg;
2858         uint32_t *a = va + seg;
2859         uint32_t sum0, sum1, sum2, sum3;
2860 
2861         /*
2862          * Process the entire segment at once, writing back the
2863          * results only after we've consumed all of the inputs.
2864          *
2865          * Key to indices by column:
2866          *          i   j                  i             j
2867          */
2868         sum0 = a[H4(0 + 0)];
2869         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2870         sum1 = a[H4(0 + 1)];
2871         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2872         sum2 = a[H4(2 + 0)];
2873         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2874         sum3 = a[H4(2 + 1)];
2875         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2876 
2877         d[H4(0)] = sum0;
2878         d[H4(1)] = sum1;
2879         d[H4(2)] = sum2;
2880         d[H4(3)] = sum3;
2881     }
2882     clear_tail(vd, opr_sz, simd_maxsz(desc));
2883 }
2884 
2885 #define DO_MMLA_B(NAME, INNER) \
2886     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2887     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2888 
2889 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2890 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2891 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
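
/*
 * A rough scalar restatement of the per-segment operation above, for the
 * signed case and ignoring the host-endian H1/H4 adjustments;
 * mmla_ref_segment is a hypothetical name, not a helper used elsewhere.
 * Each 128-bit segment of N and M is viewed as a 2x8 matrix of bytes and
 * the destination as a 2x2 matrix of 32-bit accumulators:
 *
 *     static void mmla_ref_segment(int32_t d[2][2], const int32_t a[2][2],
 *                                  const int8_t n[2][8], const int8_t m[2][8])
 *     {
 *         for (int i = 0; i < 2; i++) {
 *             for (int j = 0; j < 2; j++) {
 *                 int32_t sum = a[i][j];
 *                 for (int k = 0; k < 8; k++) {
 *                     sum += n[i][k] * m[j][k];
 *                 }
 *                 d[i][j] = sum;
 *             }
 *         }
 *     }
 */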
2892 
2893 /*
2894  * BFloat16 Dot Product
2895  */
2896 
2897 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2898 {
2899     /*
2900      * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2901      * For EBF = 0, we ignore the FPCR bits which determine rounding
2902      * mode and denormal-flushing, and we do unfused multiplies and
2903      * additions with intermediate rounding of all products and sums.
2904      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2905      * and we perform a fused two-way sum-of-products without intermediate
2906      * rounding of the products.
2907      * In either case, we don't set fp exception flags.
2908      *
2909      * EBF is AArch64 only, so even if it's set in the FPCR it has
2910      * no effect on AArch32 instructions.
2911      */
2912     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2913 
2914     *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32;
2915     set_default_nan_mode(true, statusp);
2916 
2917     if (ebf) {
2918         /* EBF=1 needs to do a step with round-to-odd semantics */
2919         *oddstatusp = *statusp;
2920         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2921     } else {
2922         set_flush_to_zero(true, statusp);
2923         set_flush_inputs_to_zero(true, statusp);
2924         set_float_rounding_mode(float_round_to_odd_inf, statusp);
2925     }
2926     return ebf;
2927 }
2928 
2929 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2930 {
2931     float32 t1, t2;
2932 
2933     /*
2934      * Extract each BFloat16 from the element pair and widen it to
2935      * float32: the low element is shifted up, the high one masked in place.
2936      */
2937     t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2938     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2939     t1 = float32_add(t1, t2, fpst);
2940     t1 = float32_add(sum, t1, fpst);
2941 
2942     return t1;
2943 }
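
/*
 * Note on the widening above: a bfloat16 is bit-for-bit the top half of
 * the float32 with the same value, so shifting the low element of the
 * pair up by 16 (or masking the high element in place) produces an exact
 * float32.  For example the bfloat16 pattern 0x3f80 (1.0) becomes the
 * float32 pattern 0x3f800000, which is also 1.0.
 */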
2944 
2945 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
2946                      float_status *fpst, float_status *fpst_odd)
2947 {
2948     /*
2949      * Compare f16_dotadd() in sme_helper.c, but here we have
2950      * bfloat16 inputs. In particular that means that we do not
2951      * want the FPCR.FZ16 flush semantics, so we use the normal
2952      * float_status for the input handling here.
2953      */
2954     float64 e1r = float32_to_float64(e1 << 16, fpst);
2955     float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
2956     float64 e2r = float32_to_float64(e2 << 16, fpst);
2957     float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
2958     float64 t64;
2959     float32 t32;
2960 
2961     /*
2962      * The ARM pseudocode function FPDot performs both multiplies
2963      * and the add with a single rounding operation.  Emulate this
2964      * by performing the first multiply in round-to-odd, then doing
2965      * the second multiply as fused multiply-add, and rounding to
2966      * float32 all in one step.
2967      */
2968     t64 = float64_mul(e1r, e2r, fpst_odd);
2969     t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
2970 
2971     /* This conversion is exact, because we've already rounded. */
2972     t32 = float64_to_float32(t64, fpst);
2973 
2974     /* The final accumulation step is not fused. */
2975     return float32_add(sum, t32, fpst);
2976 }
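
/*
 * Numerical note (an observation, not load-bearing for the code above):
 * a bfloat16 carries an 8-bit significand, so each float64 product of
 * two converted bfloat16 values needs at most 16 significand bits and is
 * exact.  The only rounding that can change the value is therefore the
 * single rounding to float32 precision inside float64r32_muladd, which
 * is how the single-rounding FPDot semantics are preserved; round-to-odd
 * on the first multiply is the conservative status for an intermediate
 * step that must not introduce a second rounding.
 */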
2977 
2978 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
2979                         CPUARMState *env, uint32_t desc)
2980 {
2981     intptr_t i, opr_sz = simd_oprsz(desc);
2982     float32 *d = vd, *a = va;
2983     uint32_t *n = vn, *m = vm;
2984     float_status fpst, fpst_odd;
2985 
2986     if (is_ebf(env, &fpst, &fpst_odd)) {
2987         for (i = 0; i < opr_sz / 4; ++i) {
2988             d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
2989         }
2990     } else {
2991         for (i = 0; i < opr_sz / 4; ++i) {
2992             d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
2993         }
2994     }
2995     clear_tail(d, opr_sz, simd_maxsz(desc));
2996 }
2997 
2998 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2999                             void *va, CPUARMState *env, uint32_t desc)
3000 {
3001     intptr_t i, j, opr_sz = simd_oprsz(desc);
3002     intptr_t index = simd_data(desc);
3003     intptr_t elements = opr_sz / 4;
3004     intptr_t eltspersegment = MIN(16 / 4, elements);
3005     float32 *d = vd, *a = va;
3006     uint32_t *n = vn, *m = vm;
3007     float_status fpst, fpst_odd;
3008 
3009     if (is_ebf(env, &fpst, &fpst_odd)) {
3010         for (i = 0; i < elements; i += eltspersegment) {
3011             uint32_t m_idx = m[i + H4(index)];
3012 
3013             for (j = i; j < i + eltspersegment; j++) {
3014                 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
3015             }
3016         }
3017     } else {
3018         for (i = 0; i < elements; i += eltspersegment) {
3019             uint32_t m_idx = m[i + H4(index)];
3020 
3021             for (j = i; j < i + eltspersegment; j++) {
3022                 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
3023             }
3024         }
3025     }
3026     clear_tail(d, opr_sz, simd_maxsz(desc));
3027 }
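
/*
 * Layout example for the indexed form above (ignoring the host-endian H4
 * adjustment): with opr_sz == 32 there are two 128-bit segments and
 * eltspersegment == 4, so with index == 1 the first four results all use
 * the bfloat16 pair m[1] and the last four all use m[5]; the indexed
 * pair is re-read from each segment of the m operand in turn.
 */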
3028 
3029 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
3030                          CPUARMState *env, uint32_t desc)
3031 {
3032     intptr_t s, opr_sz = simd_oprsz(desc);
3033     float32 *d = vd, *a = va;
3034     uint32_t *n = vn, *m = vm;
3035     float_status fpst, fpst_odd;
3036 
3037     if (is_ebf(env, &fpst, &fpst_odd)) {
3038         for (s = 0; s < opr_sz / 4; s += 4) {
3039             float32 sum00, sum01, sum10, sum11;
3040 
3041             /*
3042              * Process the entire segment at once, writing back the
3043              * results only after we've consumed all of the inputs.
3044              *
3045              * Key to indices by column:
3046              *               i   j               i   k             j   k
3047              */
3048             sum00 = a[s + H4(0 + 0)];
3049             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3050             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3051 
3052             sum01 = a[s + H4(0 + 1)];
3053             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3054             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3055 
3056             sum10 = a[s + H4(2 + 0)];
3057             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3058             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3059 
3060             sum11 = a[s + H4(2 + 1)];
3061             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3062             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3063 
3064             d[s + H4(0 + 0)] = sum00;
3065             d[s + H4(0 + 1)] = sum01;
3066             d[s + H4(2 + 0)] = sum10;
3067             d[s + H4(2 + 1)] = sum11;
3068         }
3069     } else {
3070         for (s = 0; s < opr_sz / 4; s += 4) {
3071             float32 sum00, sum01, sum10, sum11;
3072 
3073             /*
3074              * Process the entire segment at once, writing back the
3075              * results only after we've consumed all of the inputs.
3076              *
3077              * Key to indices by column:
3078              *               i   j           i   k             j   k
3079              */
3080             sum00 = a[s + H4(0 + 0)];
3081             sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
3082             sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
3083 
3084             sum01 = a[s + H4(0 + 1)];
3085             sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
3086             sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
3087 
3088             sum10 = a[s + H4(2 + 0)];
3089             sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
3090             sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
3091 
3092             sum11 = a[s + H4(2 + 1)];
3093             sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
3094             sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
3095 
3096             d[s + H4(0 + 0)] = sum00;
3097             d[s + H4(0 + 1)] = sum01;
3098             d[s + H4(2 + 0)] = sum10;
3099             d[s + H4(2 + 1)] = sum11;
3100         }
3101     }
3102     clear_tail(d, opr_sz, simd_maxsz(desc));
3103 }
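
/*
 * Shape note for the operation above (informal): within each 128-bit
 * segment, n and m each hold a 2x4 matrix of bfloat16 (two rows of four,
 * packed as two uint32 element pairs per row), and the result is the
 * 2x2 float32 matrix
 *
 *     d[i][j] = a[i][j] + sum_{k=0..3} n[i][k] * m[j][k]
 *
 * evaluated with whichever of the EBF / non-EBF dot-product rules was
 * selected above.
 */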
3104 
3105 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
3106                          float_status *stat, uint32_t desc)
3107 {
3108     intptr_t i, opr_sz = simd_oprsz(desc);
3109     intptr_t sel = simd_data(desc);
3110     float32 *d = vd, *a = va;
3111     bfloat16 *n = vn, *m = vm;
3112 
3113     for (i = 0; i < opr_sz / 4; ++i) {
3114         float32 nn = n[H2(i * 2 + sel)] << 16;
3115         float32 mm = m[H2(i * 2 + sel)] << 16;
3116         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
3117     }
3118     clear_tail(d, opr_sz, simd_maxsz(desc));
3119 }
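
/*
 * Selection example for the widening multiply-add above (ignoring the
 * host-endian H2/H4 adjustments): with sel == 0 each float32 result is
 * d[i] = a[i] + n[2*i] * m[2*i] using the even-numbered bfloat16
 * elements, and with sel == 1 it uses the odd-numbered ones,
 * d[i] = a[i] + n[2*i+1] * m[2*i+1], each as a fused multiply-add.
 */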
3120 
3121 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
3122                              void *va, float_status *stat, uint32_t desc)
3123 {
3124     intptr_t i, j, opr_sz = simd_oprsz(desc);
3125     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3126     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3127     intptr_t elements = opr_sz / 4;
3128     intptr_t eltspersegment = MIN(16 / 4, elements);
3129     float32 *d = vd, *a = va;
3130     bfloat16 *n = vn, *m = vm;
3131 
3132     for (i = 0; i < elements; i += eltspersegment) {
3133         float32 m_idx = m[H2(2 * i + index)] << 16;
3134 
3135         for (j = i; j < i + eltspersegment; j++) {
3136             float32 n_j = n[H2(2 * j + sel)] << 16;
3137             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
3138         }
3139     }
3140     clear_tail(d, opr_sz, simd_maxsz(desc));
3141 }
3142 
3143 #define DO_CLAMP(NAME, TYPE) \
3144 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
3145 {                                                                       \
3146     intptr_t i, opr_sz = simd_oprsz(desc);                              \
3147     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
3148         TYPE aa = *(TYPE *)(a + i);                                     \
3149         TYPE nn = *(TYPE *)(n + i);                                     \
3150         TYPE mm = *(TYPE *)(m + i);                                     \
3151         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
3152         *(TYPE *)(d + i) = dd;                                          \
3153     }                                                                   \
3154     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
3155 }
3156 
3157 DO_CLAMP(gvec_sclamp_b, int8_t)
3158 DO_CLAMP(gvec_sclamp_h, int16_t)
3159 DO_CLAMP(gvec_sclamp_s, int32_t)
3160 DO_CLAMP(gvec_sclamp_d, int64_t)
3161 
3162 DO_CLAMP(gvec_uclamp_b, uint8_t)
3163 DO_CLAMP(gvec_uclamp_h, uint16_t)
3164 DO_CLAMP(gvec_uclamp_s, uint32_t)
3165 DO_CLAMP(gvec_uclamp_d, uint64_t)
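
/*
 * Operand roles in the clamp above (illustrative): the a operand is the
 * value being clamped, n is the lower bound and m the upper bound, e.g.
 * for the signed byte case
 *
 *     aa = -5, nn = 0, mm = 10  =>  MIN(MAX(-5, 0), 10) = 0
 *
 * so values below the lower bound saturate to it and values above the
 * upper bound saturate to that.
 */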
3166 
3167 /* Bit count in each 8-bit word. */
3168 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
3169 {
3170     intptr_t i, opr_sz = simd_oprsz(desc);
3171     uint8_t *d = vd, *n = vn;
3172 
3173     for (i = 0; i < opr_sz; ++i) {
3174         d[i] = ctpop8(n[i]);
3175     }
3176     clear_tail(d, opr_sz, simd_maxsz(desc));
3177 }
3178 
3179 /* Reverse bits in each 8-bit word. */
3180 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
3181 {
3182     intptr_t i, opr_sz = simd_oprsz(desc);
3183     uint64_t *d = vd, *n = vn;
3184 
3185     for (i = 0; i < opr_sz / 8; ++i) {
3186         d[i] = revbit64(bswap64(n[i]));
3187     }
3188     clear_tail(d, opr_sz, simd_maxsz(desc));
3189 }
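
/*
 * Why revbit64(bswap64(x)) reverses the bits of each byte in place:
 * revbit64 reverses all 64 bits, which reverses both the byte order and
 * the bit order within each byte; applying bswap64 first pre-reverses
 * the byte order so that only the per-byte bit reversal remains.  A
 * byte-by-byte loop would compute the same result (illustrative only):
 *
 *     for (i = 0; i < opr_sz; ++i) {
 *         ((uint8_t *)vd)[i] = revbit8(((uint8_t *)vn)[i]);
 *     }
 */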
3190 
3191 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
3192 {
3193     intptr_t i, opr_sz = simd_oprsz(desc);
3194     uint32_t *d = vd, *n = vn;
3195 
3196     for (i = 0; i < opr_sz / 4; ++i) {
3197         d[i] = helper_recpe_u32(n[i]);
3198     }
3199     clear_tail(d, opr_sz, simd_maxsz(desc));
3200 }
3201 
3202 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
3203 {
3204     intptr_t i, opr_sz = simd_oprsz(desc);
3205     uint32_t *d = vd, *n = vn;
3206 
3207     for (i = 0; i < opr_sz / 4; ++i) {
3208         d[i] = helper_rsqrte_u32(n[i]);
3209     }
3210     clear_tail(d, opr_sz, simd_maxsz(desc));
3211 }
3212