xref: /qemu/target/arm/tcg/vec_helper.c (revision 384433e709836209ae34bedda8a2a57992be8e18)
1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28 
29 /*
30  * Data for expanding active predicate bits to bytes, for byte elements.
31  *
32  *  for (i = 0; i < 256; ++i) {
33  *      unsigned long m = 0;
34  *      for (j = 0; j < 8; j++) {
35  *          if ((i >> j) & 1) {
36  *              m |= 0xfful << (j << 3);
37  *          }
38  *      }
39  *      printf("0x%016lx,\n", m);
40  *  }
41  */
42 const uint64_t expand_pred_b_data[256] = {
43     0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44     0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45     0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46     0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47     0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48     0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49     0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50     0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51     0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52     0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53     0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54     0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55     0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56     0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57     0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58     0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59     0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60     0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61     0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62     0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63     0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64     0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65     0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66     0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67     0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68     0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69     0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70     0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71     0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72     0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73     0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74     0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75     0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76     0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77     0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78     0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79     0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80     0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81     0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82     0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83     0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84     0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85     0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86     0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87     0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88     0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89     0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90     0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91     0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92     0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93     0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94     0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95     0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96     0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97     0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98     0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99     0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100     0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101     0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102     0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103     0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104     0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105     0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106     0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107     0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108     0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109     0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110     0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111     0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112     0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113     0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114     0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115     0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116     0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117     0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118     0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119     0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120     0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121     0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122     0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123     0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124     0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125     0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126     0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127     0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128     0xffffffffffffffff,
129 };
130 
131 /*
132  * Similarly for half-word elements.
133  *  for (i = 0; i < 256; ++i) {
134  *      unsigned long m = 0;
135  *      if (i & 0xaa) {
136  *          continue;
137  *      }
138  *      for (j = 0; j < 8; j += 2) {
139  *          if ((i >> j) & 1) {
140  *              m |= 0xfffful << (j << 3);
141  *          }
142  *      }
143  *      printf("[0x%x] = 0x%016lx,\n", i, m);
144  *  }
145  */
146 const uint64_t expand_pred_h_data[0x55 + 1] = {
147     [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148     [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149     [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150     [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151     [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152     [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153     [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154     [0x55] = 0xffffffffffffffff,
155 };
156 
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159                      bool neg, bool round)
160 {
161     /*
162      * Simplify:
163      * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164      * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
165      */
166     int32_t ret = (int32_t)src1 * src2;
167     if (neg) {
168         ret = -ret;
169     }
170     ret += ((int32_t)src3 << 7) + (round << 6);
171     ret >>= 7;
172 
173     if (ret != (int8_t)ret) {
174         ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175     }
176     return ret;
177 }
178 
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180                              void *va, uint32_t desc)
181 {
182     intptr_t i, opr_sz = simd_oprsz(desc);
183     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 
185     for (i = 0; i < opr_sz; ++i) {
186         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187     }
188 }
189 
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191                              void *va, uint32_t desc)
192 {
193     intptr_t i, opr_sz = simd_oprsz(desc);
194     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 
196     for (i = 0; i < opr_sz; ++i) {
197         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198     }
199 }
200 
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203     intptr_t i, opr_sz = simd_oprsz(desc);
204     int8_t *d = vd, *n = vn, *m = vm;
205 
206     for (i = 0; i < opr_sz; ++i) {
207         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208     }
209 }
210 
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213     intptr_t i, opr_sz = simd_oprsz(desc);
214     int8_t *d = vd, *n = vn, *m = vm;
215 
216     for (i = 0; i < opr_sz; ++i) {
217         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218     }
219 }
220 
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223                       bool neg, bool round, uint32_t *sat)
224 {
225     /* Simplify similarly to do_sqrdmlah_b above.  */
226     int32_t ret = (int32_t)src1 * src2;
227     if (neg) {
228         ret = -ret;
229     }
230     ret += ((int32_t)src3 << 15) + (round << 14);
231     ret >>= 15;
232 
233     if (ret != (int16_t)ret) {
234         *sat = 1;
235         ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236     }
237     return ret;
238 }
239 
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241                                   uint32_t src2, uint32_t src3)
242 {
243     uint32_t *sat = &env->vfp.qc[0];
244     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246                                 false, true, sat);
247     return deposit32(e1, 16, 16, e2);
248 }
249 
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251                               void *vq, uint32_t desc)
252 {
253     uintptr_t opr_sz = simd_oprsz(desc);
254     int16_t *d = vd;
255     int16_t *n = vn;
256     int16_t *m = vm;
257     uintptr_t i;
258 
259     for (i = 0; i < opr_sz / 2; ++i) {
260         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261     }
262     clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264 
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266                                   uint32_t src2, uint32_t src3)
267 {
268     uint32_t *sat = &env->vfp.qc[0];
269     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271                                 true, true, sat);
272     return deposit32(e1, 16, 16, e2);
273 }
274 
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276                               void *vq, uint32_t desc)
277 {
278     uintptr_t opr_sz = simd_oprsz(desc);
279     int16_t *d = vd;
280     int16_t *n = vn;
281     int16_t *m = vm;
282     uintptr_t i;
283 
284     for (i = 0; i < opr_sz / 2; ++i) {
285         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286     }
287     clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289 
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291                             void *vq, uint32_t desc)
292 {
293     intptr_t i, opr_sz = simd_oprsz(desc);
294     int16_t *d = vd, *n = vn, *m = vm;
295 
296     for (i = 0; i < opr_sz / 2; ++i) {
297         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298     }
299     clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301 
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303                              void *vq, uint32_t desc)
304 {
305     intptr_t i, opr_sz = simd_oprsz(desc);
306     int16_t *d = vd, *n = vn, *m = vm;
307 
308     for (i = 0; i < opr_sz / 2; ++i) {
309         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310     }
311     clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313 
314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315                                 void *vq, uint32_t desc)
316 {
317     intptr_t i, j, opr_sz = simd_oprsz(desc);
318     int idx = simd_data(desc);
319     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320     intptr_t elements = opr_sz / 2;
321     intptr_t eltspersegment = MIN(16 / 2, elements);
322 
323     for (i = 0; i < elements; i += 16 / 2) {
324         int16_t mm = m[i];
325         for (j = 0; j < eltspersegment; ++j) {
326             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
327         }
328     }
329     clear_tail(d, opr_sz, simd_maxsz(desc));
330 }
331 
332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
333                                  void *vq, uint32_t desc)
334 {
335     intptr_t i, j, opr_sz = simd_oprsz(desc);
336     int idx = simd_data(desc);
337     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338     intptr_t elements = opr_sz / 2;
339     intptr_t eltspersegment = MIN(16 / 2, elements);
340 
341     for (i = 0; i < elements; i += 16 / 2) {
342         int16_t mm = m[i];
343         for (j = 0; j < eltspersegment; ++j) {
344             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
345         }
346     }
347     clear_tail(d, opr_sz, simd_maxsz(desc));
348 }
349 
350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
351                                  void *vq, uint32_t desc)
352 {
353     intptr_t i, j, opr_sz = simd_oprsz(desc);
354     int idx = simd_data(desc);
355     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
356     intptr_t elements = opr_sz / 2;
357     intptr_t eltspersegment = MIN(16 / 2, elements);
358 
359     for (i = 0; i < elements; i += 16 / 2) {
360         int16_t mm = m[i];
361         for (j = 0; j < eltspersegment; ++j) {
362             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
363         }
364     }
365     clear_tail(d, opr_sz, simd_maxsz(desc));
366 }
367 
368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
369                                  void *vq, uint32_t desc)
370 {
371     intptr_t i, j, opr_sz = simd_oprsz(desc);
372     int idx = simd_data(desc);
373     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
374     intptr_t elements = opr_sz / 2;
375     intptr_t eltspersegment = MIN(16 / 2, elements);
376 
377     for (i = 0; i < elements; i += 16 / 2) {
378         int16_t mm = m[i];
379         for (j = 0; j < eltspersegment; ++j) {
380             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
381         }
382     }
383     clear_tail(d, opr_sz, simd_maxsz(desc));
384 }
385 
386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
387                              void *va, uint32_t desc)
388 {
389     intptr_t i, opr_sz = simd_oprsz(desc);
390     int16_t *d = vd, *n = vn, *m = vm, *a = va;
391     uint32_t discard;
392 
393     for (i = 0; i < opr_sz / 2; ++i) {
394         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
395     }
396 }
397 
398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
399                              void *va, uint32_t desc)
400 {
401     intptr_t i, opr_sz = simd_oprsz(desc);
402     int16_t *d = vd, *n = vn, *m = vm, *a = va;
403     uint32_t discard;
404 
405     for (i = 0; i < opr_sz / 2; ++i) {
406         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
407     }
408 }
409 
410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
411 {
412     intptr_t i, opr_sz = simd_oprsz(desc);
413     int16_t *d = vd, *n = vn, *m = vm;
414     uint32_t discard;
415 
416     for (i = 0; i < opr_sz / 2; ++i) {
417         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
418     }
419 }
420 
421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
422 {
423     intptr_t i, opr_sz = simd_oprsz(desc);
424     int16_t *d = vd, *n = vn, *m = vm;
425     uint32_t discard;
426 
427     for (i = 0; i < opr_sz / 2; ++i) {
428         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
429     }
430 }
431 
432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
433 {
434     intptr_t i, j, opr_sz = simd_oprsz(desc);
435     int idx = simd_data(desc);
436     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
437     uint32_t discard;
438 
439     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
440         int16_t mm = m[i];
441         for (j = 0; j < 16 / 2; ++j) {
442             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
443         }
444     }
445 }
446 
447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
448 {
449     intptr_t i, j, opr_sz = simd_oprsz(desc);
450     int idx = simd_data(desc);
451     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
452     uint32_t discard;
453 
454     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
455         int16_t mm = m[i];
456         for (j = 0; j < 16 / 2; ++j) {
457             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
458         }
459     }
460 }
461 
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
464                       bool neg, bool round, uint32_t *sat)
465 {
466     /* Simplify similarly to do_sqrdmlah_b above.  */
467     int64_t ret = (int64_t)src1 * src2;
468     if (neg) {
469         ret = -ret;
470     }
471     ret += ((int64_t)src3 << 31) + (round << 30);
472     ret >>= 31;
473 
474     if (ret != (int32_t)ret) {
475         *sat = 1;
476         ret = (ret < 0 ? INT32_MIN : INT32_MAX);
477     }
478     return ret;
479 }
480 
481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
482                                   int32_t src2, int32_t src3)
483 {
484     uint32_t *sat = &env->vfp.qc[0];
485     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
486 }
487 
488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
489                               void *vq, uint32_t desc)
490 {
491     uintptr_t opr_sz = simd_oprsz(desc);
492     int32_t *d = vd;
493     int32_t *n = vn;
494     int32_t *m = vm;
495     uintptr_t i;
496 
497     for (i = 0; i < opr_sz / 4; ++i) {
498         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
499     }
500     clear_tail(d, opr_sz, simd_maxsz(desc));
501 }
502 
503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
504                                   int32_t src2, int32_t src3)
505 {
506     uint32_t *sat = &env->vfp.qc[0];
507     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
508 }
509 
510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
511                               void *vq, uint32_t desc)
512 {
513     uintptr_t opr_sz = simd_oprsz(desc);
514     int32_t *d = vd;
515     int32_t *n = vn;
516     int32_t *m = vm;
517     uintptr_t i;
518 
519     for (i = 0; i < opr_sz / 4; ++i) {
520         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
521     }
522     clear_tail(d, opr_sz, simd_maxsz(desc));
523 }
524 
525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
526                             void *vq, uint32_t desc)
527 {
528     intptr_t i, opr_sz = simd_oprsz(desc);
529     int32_t *d = vd, *n = vn, *m = vm;
530 
531     for (i = 0; i < opr_sz / 4; ++i) {
532         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
533     }
534     clear_tail(d, opr_sz, simd_maxsz(desc));
535 }
536 
537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
538                              void *vq, uint32_t desc)
539 {
540     intptr_t i, opr_sz = simd_oprsz(desc);
541     int32_t *d = vd, *n = vn, *m = vm;
542 
543     for (i = 0; i < opr_sz / 4; ++i) {
544         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
545     }
546     clear_tail(d, opr_sz, simd_maxsz(desc));
547 }
548 
549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
550                                 void *vq, uint32_t desc)
551 {
552     intptr_t i, j, opr_sz = simd_oprsz(desc);
553     int idx = simd_data(desc);
554     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
555     intptr_t elements = opr_sz / 4;
556     intptr_t eltspersegment = MIN(16 / 4, elements);
557 
558     for (i = 0; i < elements; i += 16 / 4) {
559         int32_t mm = m[i];
560         for (j = 0; j < eltspersegment; ++j) {
561             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
562         }
563     }
564     clear_tail(d, opr_sz, simd_maxsz(desc));
565 }
566 
567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
568                                  void *vq, uint32_t desc)
569 {
570     intptr_t i, j, opr_sz = simd_oprsz(desc);
571     int idx = simd_data(desc);
572     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
573     intptr_t elements = opr_sz / 4;
574     intptr_t eltspersegment = MIN(16 / 4, elements);
575 
576     for (i = 0; i < elements; i += 16 / 4) {
577         int32_t mm = m[i];
578         for (j = 0; j < eltspersegment; ++j) {
579             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
580         }
581     }
582     clear_tail(d, opr_sz, simd_maxsz(desc));
583 }
584 
585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
586                                  void *vq, uint32_t desc)
587 {
588     intptr_t i, j, opr_sz = simd_oprsz(desc);
589     int idx = simd_data(desc);
590     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
591     intptr_t elements = opr_sz / 4;
592     intptr_t eltspersegment = MIN(16 / 4, elements);
593 
594     for (i = 0; i < elements; i += 16 / 4) {
595         int32_t mm = m[i];
596         for (j = 0; j < eltspersegment; ++j) {
597             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
598         }
599     }
600     clear_tail(d, opr_sz, simd_maxsz(desc));
601 }
602 
603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
604                                  void *vq, uint32_t desc)
605 {
606     intptr_t i, j, opr_sz = simd_oprsz(desc);
607     int idx = simd_data(desc);
608     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
609     intptr_t elements = opr_sz / 4;
610     intptr_t eltspersegment = MIN(16 / 4, elements);
611 
612     for (i = 0; i < elements; i += 16 / 4) {
613         int32_t mm = m[i];
614         for (j = 0; j < eltspersegment; ++j) {
615             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
616         }
617     }
618     clear_tail(d, opr_sz, simd_maxsz(desc));
619 }
620 
621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
622                              void *va, uint32_t desc)
623 {
624     intptr_t i, opr_sz = simd_oprsz(desc);
625     int32_t *d = vd, *n = vn, *m = vm, *a = va;
626     uint32_t discard;
627 
628     for (i = 0; i < opr_sz / 4; ++i) {
629         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
630     }
631 }
632 
633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
634                              void *va, uint32_t desc)
635 {
636     intptr_t i, opr_sz = simd_oprsz(desc);
637     int32_t *d = vd, *n = vn, *m = vm, *a = va;
638     uint32_t discard;
639 
640     for (i = 0; i < opr_sz / 4; ++i) {
641         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
642     }
643 }
644 
645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
646 {
647     intptr_t i, opr_sz = simd_oprsz(desc);
648     int32_t *d = vd, *n = vn, *m = vm;
649     uint32_t discard;
650 
651     for (i = 0; i < opr_sz / 4; ++i) {
652         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
653     }
654 }
655 
656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
657 {
658     intptr_t i, opr_sz = simd_oprsz(desc);
659     int32_t *d = vd, *n = vn, *m = vm;
660     uint32_t discard;
661 
662     for (i = 0; i < opr_sz / 4; ++i) {
663         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
664     }
665 }
666 
667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
668 {
669     intptr_t i, j, opr_sz = simd_oprsz(desc);
670     int idx = simd_data(desc);
671     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
672     uint32_t discard;
673 
674     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
675         int32_t mm = m[i];
676         for (j = 0; j < 16 / 4; ++j) {
677             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
678         }
679     }
680 }
681 
682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
683 {
684     intptr_t i, j, opr_sz = simd_oprsz(desc);
685     int idx = simd_data(desc);
686     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
687     uint32_t discard;
688 
689     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
690         int32_t mm = m[i];
691         for (j = 0; j < 16 / 4; ++j) {
692             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
693         }
694     }
695 }
696 
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
698 static int64_t do_sat128_d(Int128 r)
699 {
700     int64_t ls = int128_getlo(r);
701     int64_t hs = int128_gethi(r);
702 
703     if (unlikely(hs != (ls >> 63))) {
704         return hs < 0 ? INT64_MIN : INT64_MAX;
705     }
706     return ls;
707 }
708 
709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710 {
711     uint64_t l, h;
712     Int128 r, t;
713 
714     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
715     muls64(&l, &h, m, n);
716     r = int128_make128(l, h);
717     if (neg) {
718         r = int128_neg(r);
719     }
720     if (a) {
721         t = int128_exts64(a);
722         t = int128_lshift(t, 63);
723         r = int128_add(r, t);
724     }
725     if (round) {
726         t = int128_exts64(1ll << 62);
727         r = int128_add(r, t);
728     }
729     r = int128_rshift(r, 63);
730 
731     return do_sat128_d(r);
732 }
733 
734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735                              void *va, uint32_t desc)
736 {
737     intptr_t i, opr_sz = simd_oprsz(desc);
738     int64_t *d = vd, *n = vn, *m = vm, *a = va;
739 
740     for (i = 0; i < opr_sz / 8; ++i) {
741         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742     }
743 }
744 
745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746                              void *va, uint32_t desc)
747 {
748     intptr_t i, opr_sz = simd_oprsz(desc);
749     int64_t *d = vd, *n = vn, *m = vm, *a = va;
750 
751     for (i = 0; i < opr_sz / 8; ++i) {
752         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753     }
754 }
755 
756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757 {
758     intptr_t i, opr_sz = simd_oprsz(desc);
759     int64_t *d = vd, *n = vn, *m = vm;
760 
761     for (i = 0; i < opr_sz / 8; ++i) {
762         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763     }
764 }
765 
766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767 {
768     intptr_t i, opr_sz = simd_oprsz(desc);
769     int64_t *d = vd, *n = vn, *m = vm;
770 
771     for (i = 0; i < opr_sz / 8; ++i) {
772         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773     }
774 }
775 
776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777 {
778     intptr_t i, j, opr_sz = simd_oprsz(desc);
779     int idx = simd_data(desc);
780     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781 
782     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783         int64_t mm = m[i];
784         for (j = 0; j < 16 / 8; ++j) {
785             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786         }
787     }
788 }
789 
790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791 {
792     intptr_t i, j, opr_sz = simd_oprsz(desc);
793     int idx = simd_data(desc);
794     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795 
796     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797         int64_t mm = m[i];
798         for (j = 0; j < 16 / 8; ++j) {
799             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800         }
801     }
802 }
803 
804 /* Integer 8 and 16-bit dot-product.
805  *
806  * Note that for the loops herein, host endianness does not matter
807  * with respect to the ordering of data within the quad-width lanes.
808  * All elements are treated equally, no matter where they are.
809  */
810 
811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
813 {                                                                         \
814     intptr_t i, opr_sz = simd_oprsz(desc);                                \
815     TYPED *d = vd, *a = va;                                               \
816     TYPEN *n = vn;                                                        \
817     TYPEM *m = vm;                                                        \
818     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
819         d[i] = (a[i] +                                                    \
820                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
821                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
822                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
823                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
824     }                                                                     \
825     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
826 }
827 
828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
833 
834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
836 {                                                                         \
837     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
838     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
839     /*                                                                    \
840      * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
841      * first iteration might not be a full 16 byte segment. But           \
842      * for vector lengths beyond that this must be SVE and we know        \
843      * opr_sz is a multiple of 16, so we need not clamp segend            \
844      * to opr_sz_n when we advance it at the end of the loop.             \
845      */                                                                   \
846     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
847     intptr_t index = simd_data(desc);                                     \
848     TYPED *d = vd, *a = va;                                               \
849     TYPEN *n = vn;                                                        \
850     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
851     do {                                                                  \
852         TYPED m0 = m_indexed[i * 4 + 0];                                  \
853         TYPED m1 = m_indexed[i * 4 + 1];                                  \
854         TYPED m2 = m_indexed[i * 4 + 2];                                  \
855         TYPED m3 = m_indexed[i * 4 + 3];                                  \
856         do {                                                              \
857             d[i] = (a[i] +                                                \
858                     n[i * 4 + 0] * m0 +                                   \
859                     n[i * 4 + 1] * m1 +                                   \
860                     n[i * 4 + 2] * m2 +                                   \
861                     n[i * 4 + 3] * m3);                                   \
862         } while (++i < segend);                                           \
863         segend = i + (16 / sizeof(TYPED));                                \
864     } while (i < opr_sz_n);                                               \
865     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
866 }
867 
868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
874 
875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
876                          float_status *fpst, uint32_t desc)
877 {
878     uintptr_t opr_sz = simd_oprsz(desc);
879     float16 *d = vd;
880     float16 *n = vn;
881     float16 *m = vm;
882     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
883     uint32_t neg_imag = neg_real ^ 1;
884     uintptr_t i;
885 
886     /* Shift boolean to the sign bit so we can xor to negate.  */
887     neg_real <<= 15;
888     neg_imag <<= 15;
889 
890     for (i = 0; i < opr_sz / 2; i += 2) {
891         float16 e0 = n[H2(i)];
892         float16 e1 = m[H2(i + 1)] ^ neg_imag;
893         float16 e2 = n[H2(i + 1)];
894         float16 e3 = m[H2(i)] ^ neg_real;
895 
896         d[H2(i)] = float16_add(e0, e1, fpst);
897         d[H2(i + 1)] = float16_add(e2, e3, fpst);
898     }
899     clear_tail(d, opr_sz, simd_maxsz(desc));
900 }
901 
902 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
903                          float_status *fpst, uint32_t desc)
904 {
905     uintptr_t opr_sz = simd_oprsz(desc);
906     float32 *d = vd;
907     float32 *n = vn;
908     float32 *m = vm;
909     uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
910     uint32_t neg_imag = neg_real ^ 1;
911     uintptr_t i;
912 
913     /* Shift boolean to the sign bit so we can xor to negate.  */
914     neg_real <<= 31;
915     neg_imag <<= 31;
916 
917     for (i = 0; i < opr_sz / 4; i += 2) {
918         float32 e0 = n[H4(i)];
919         float32 e1 = m[H4(i + 1)] ^ neg_imag;
920         float32 e2 = n[H4(i + 1)];
921         float32 e3 = m[H4(i)] ^ neg_real;
922 
923         d[H4(i)] = float32_add(e0, e1, fpst);
924         d[H4(i + 1)] = float32_add(e2, e3, fpst);
925     }
926     clear_tail(d, opr_sz, simd_maxsz(desc));
927 }
928 
929 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
930                          float_status *fpst, uint32_t desc)
931 {
932     uintptr_t opr_sz = simd_oprsz(desc);
933     float64 *d = vd;
934     float64 *n = vn;
935     float64 *m = vm;
936     uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
937     uint64_t neg_imag = neg_real ^ 1;
938     uintptr_t i;
939 
940     /* Shift boolean to the sign bit so we can xor to negate.  */
941     neg_real <<= 63;
942     neg_imag <<= 63;
943 
944     for (i = 0; i < opr_sz / 8; i += 2) {
945         float64 e0 = n[i];
946         float64 e1 = m[i + 1] ^ neg_imag;
947         float64 e2 = n[i + 1];
948         float64 e3 = m[i] ^ neg_real;
949 
950         d[i] = float64_add(e0, e1, fpst);
951         d[i + 1] = float64_add(e2, e3, fpst);
952     }
953     clear_tail(d, opr_sz, simd_maxsz(desc));
954 }
955 
956 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
957                          float_status *fpst, uint32_t desc)
958 {
959     uintptr_t opr_sz = simd_oprsz(desc);
960     float16 *d = vd, *n = vn, *m = vm, *a = va;
961     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
962     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
963     uint32_t neg_real = flip ^ neg_imag;
964     uintptr_t i;
965 
966     /* Shift boolean to the sign bit so we can xor to negate.  */
967     neg_real <<= 15;
968     neg_imag <<= 15;
969 
970     for (i = 0; i < opr_sz / 2; i += 2) {
971         float16 e2 = n[H2(i + flip)];
972         float16 e1 = m[H2(i + flip)] ^ neg_real;
973         float16 e4 = e2;
974         float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
975 
976         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
977         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
978     }
979     clear_tail(d, opr_sz, simd_maxsz(desc));
980 }
981 
982 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
983                              float_status *fpst, uint32_t desc)
984 {
985     uintptr_t opr_sz = simd_oprsz(desc);
986     float16 *d = vd, *n = vn, *m = vm, *a = va;
987     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
988     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
989     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
990     uint32_t neg_real = flip ^ neg_imag;
991     intptr_t elements = opr_sz / sizeof(float16);
992     intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
993     intptr_t i, j;
994 
995     /* Shift boolean to the sign bit so we can xor to negate.  */
996     neg_real <<= 15;
997     neg_imag <<= 15;
998 
999     for (i = 0; i < elements; i += eltspersegment) {
1000         float16 mr = m[H2(i + 2 * index + 0)];
1001         float16 mi = m[H2(i + 2 * index + 1)];
1002         float16 e1 = neg_real ^ (flip ? mi : mr);
1003         float16 e3 = neg_imag ^ (flip ? mr : mi);
1004 
1005         for (j = i; j < i + eltspersegment; j += 2) {
1006             float16 e2 = n[H2(j + flip)];
1007             float16 e4 = e2;
1008 
1009             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
1010             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
1011         }
1012     }
1013     clear_tail(d, opr_sz, simd_maxsz(desc));
1014 }
1015 
1016 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
1017                          float_status *fpst, uint32_t desc)
1018 {
1019     uintptr_t opr_sz = simd_oprsz(desc);
1020     float32 *d = vd, *n = vn, *m = vm, *a = va;
1021     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1022     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1023     uint32_t neg_real = flip ^ neg_imag;
1024     uintptr_t i;
1025 
1026     /* Shift boolean to the sign bit so we can xor to negate.  */
1027     neg_real <<= 31;
1028     neg_imag <<= 31;
1029 
1030     for (i = 0; i < opr_sz / 4; i += 2) {
1031         float32 e2 = n[H4(i + flip)];
1032         float32 e1 = m[H4(i + flip)] ^ neg_real;
1033         float32 e4 = e2;
1034         float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
1035 
1036         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
1037         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
1038     }
1039     clear_tail(d, opr_sz, simd_maxsz(desc));
1040 }
1041 
1042 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
1043                              float_status *fpst, uint32_t desc)
1044 {
1045     uintptr_t opr_sz = simd_oprsz(desc);
1046     float32 *d = vd, *n = vn, *m = vm, *a = va;
1047     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1048     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1049     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1050     uint32_t neg_real = flip ^ neg_imag;
1051     intptr_t elements = opr_sz / sizeof(float32);
1052     intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
1053     intptr_t i, j;
1054 
1055     /* Shift boolean to the sign bit so we can xor to negate.  */
1056     neg_real <<= 31;
1057     neg_imag <<= 31;
1058 
1059     for (i = 0; i < elements; i += eltspersegment) {
1060         float32 mr = m[H4(i + 2 * index + 0)];
1061         float32 mi = m[H4(i + 2 * index + 1)];
1062         float32 e1 = neg_real ^ (flip ? mi : mr);
1063         float32 e3 = neg_imag ^ (flip ? mr : mi);
1064 
1065         for (j = i; j < i + eltspersegment; j += 2) {
1066             float32 e2 = n[H4(j + flip)];
1067             float32 e4 = e2;
1068 
1069             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
1070             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
1071         }
1072     }
1073     clear_tail(d, opr_sz, simd_maxsz(desc));
1074 }
1075 
1076 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
1077                          float_status *fpst, uint32_t desc)
1078 {
1079     uintptr_t opr_sz = simd_oprsz(desc);
1080     float64 *d = vd, *n = vn, *m = vm, *a = va;
1081     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1082     uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1083     uint64_t neg_real = flip ^ neg_imag;
1084     uintptr_t i;
1085 
1086     /* Shift boolean to the sign bit so we can xor to negate.  */
1087     neg_real <<= 63;
1088     neg_imag <<= 63;
1089 
1090     for (i = 0; i < opr_sz / 8; i += 2) {
1091         float64 e2 = n[i + flip];
1092         float64 e1 = m[i + flip] ^ neg_real;
1093         float64 e4 = e2;
1094         float64 e3 = m[i + 1 - flip] ^ neg_imag;
1095 
1096         d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
1097         d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
1098     }
1099     clear_tail(d, opr_sz, simd_maxsz(desc));
1100 }
1101 
1102 /*
1103  * Floating point comparisons producing an integer result (all 1s or all 0s).
1104  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1105  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1106  */
1107 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1108 {
1109     return -float16_eq_quiet(op1, op2, stat);
1110 }
1111 
1112 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1113 {
1114     return -float32_eq_quiet(op1, op2, stat);
1115 }
1116 
1117 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1118 {
1119     return -float64_eq_quiet(op1, op2, stat);
1120 }
1121 
1122 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1123 {
1124     return -float16_le(op2, op1, stat);
1125 }
1126 
1127 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1128 {
1129     return -float32_le(op2, op1, stat);
1130 }
1131 
1132 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1133 {
1134     return -float64_le(op2, op1, stat);
1135 }
1136 
1137 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1138 {
1139     return -float16_lt(op2, op1, stat);
1140 }
1141 
1142 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1143 {
1144     return -float32_lt(op2, op1, stat);
1145 }
1146 
1147 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1148 {
1149     return -float64_lt(op2, op1, stat);
1150 }
1151 
1152 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1153 {
1154     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1155 }
1156 
1157 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1158 {
1159     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1160 }
1161 
1162 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1163 {
1164     return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1165 }
1166 
1167 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1168 {
1169     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1170 }
1171 
1172 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1173 {
1174     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1175 }
1176 
1177 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1178 {
1179     return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1180 }
1181 
1182 static int16_t vfp_tosszh(float16 x, float_status *fpst)
1183 {
1184     if (float16_is_any_nan(x)) {
1185         float_raise(float_flag_invalid, fpst);
1186         return 0;
1187     }
1188     return float16_to_int16_round_to_zero(x, fpst);
1189 }
1190 
1191 static uint16_t vfp_touszh(float16 x, float_status *fpst)
1192 {
1193     if (float16_is_any_nan(x)) {
1194         float_raise(float_flag_invalid, fpst);
1195         return 0;
1196     }
1197     return float16_to_uint16_round_to_zero(x, fpst);
1198 }
1199 
1200 #define DO_2OP(NAME, FUNC, TYPE) \
1201 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc)  \
1202 {                                                                 \
1203     intptr_t i, oprsz = simd_oprsz(desc);                         \
1204     TYPE *d = vd, *n = vn;                                        \
1205     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1206         d[i] = FUNC(n[i], stat);                                  \
1207     }                                                             \
1208     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1209 }
1210 
1211 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1212 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1213 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1214 
1215 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1216 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1217 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1218 
1219 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1220 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1221 
1222 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1223 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1224 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1225 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1226 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1227 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1228 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1229 DO_2OP(gvec_touszh, vfp_touszh, float16)
1230 
1231 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1232     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1233     {                                                           \
1234         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1235     }
1236 
1237 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1238     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)    \
1239     {                                                           \
1240         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1241     }
1242 
1243 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1244     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1245     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1246     WRAP_CMP0_##DIRN(FN, CMPOP, float64)                \
1247     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1248     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)   \
1249     DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)
1250 
1251 DO_2OP_CMP0(cgt, cgt, FWD)
1252 DO_2OP_CMP0(cge, cge, FWD)
1253 DO_2OP_CMP0(ceq, ceq, FWD)
1254 DO_2OP_CMP0(clt, cgt, REV)
1255 DO_2OP_CMP0(cle, cge, REV)
1256 
1257 #undef DO_2OP
1258 #undef DO_2OP_CMP0
1259 
1260 /* Floating-point trigonometric starting value.
1261  * See the ARM ARM pseudocode function FPTrigSMul.
1262  */
1263 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1264 {
1265     float16 result = float16_mul(op1, op1, stat);
1266     if (!float16_is_any_nan(result)) {
1267         result = float16_set_sign(result, op2 & 1);
1268     }
1269     return result;
1270 }
1271 
1272 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1273 {
1274     float32 result = float32_mul(op1, op1, stat);
1275     if (!float32_is_any_nan(result)) {
1276         result = float32_set_sign(result, op2 & 1);
1277     }
1278     return result;
1279 }
1280 
1281 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1282 {
1283     float64 result = float64_mul(op1, op1, stat);
1284     if (!float64_is_any_nan(result)) {
1285         result = float64_set_sign(result, op2 & 1);
1286     }
1287     return result;
1288 }
1289 
1290 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1291 {
1292     return float16_abs(float16_sub(op1, op2, stat));
1293 }
1294 
1295 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1296 {
1297     return float32_abs(float32_sub(op1, op2, stat));
1298 }
1299 
1300 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1301 {
1302     return float64_abs(float64_sub(op1, op2, stat));
1303 }
1304 
1305 /*
1306  * Reciprocal step. These are the AArch32 version which uses a
1307  * non-fused multiply-and-subtract.
1308  */
1309 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1310 {
1311     op1 = float16_squash_input_denormal(op1, stat);
1312     op2 = float16_squash_input_denormal(op2, stat);
1313 
1314     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1315         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1316         return float16_two;
1317     }
1318     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1319 }
1320 
1321 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1322 {
1323     op1 = float32_squash_input_denormal(op1, stat);
1324     op2 = float32_squash_input_denormal(op2, stat);
1325 
1326     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1327         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1328         return float32_two;
1329     }
1330     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1331 }
1332 
1333 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1334 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1335 {
1336     op1 = float16_squash_input_denormal(op1, stat);
1337     op2 = float16_squash_input_denormal(op2, stat);
1338 
1339     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1340         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1341         return float16_one_point_five;
1342     }
1343     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1344     return float16_div(op1, float16_two, stat);
1345 }
1346 
1347 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1348 {
1349     op1 = float32_squash_input_denormal(op1, stat);
1350     op2 = float32_squash_input_denormal(op2, stat);
1351 
1352     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1353         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1354         return float32_one_point_five;
1355     }
1356     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1357     return float32_div(op1, float32_two, stat);
1358 }
1359 
1360 #define DO_3OP(NAME, FUNC, TYPE) \
1361 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1362                   float_status *stat, uint32_t desc)                       \
1363 {                                                                          \
1364     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1365     TYPE *d = vd, *n = vn, *m = vm;                                        \
1366     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1367         d[i] = FUNC(n[i], m[i], stat);                                     \
1368     }                                                                      \
1369     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1370 }
1371 
1372 DO_3OP(gvec_fadd_h, float16_add, float16)
1373 DO_3OP(gvec_fadd_s, float32_add, float32)
1374 DO_3OP(gvec_fadd_d, float64_add, float64)
1375 
1376 DO_3OP(gvec_fsub_h, float16_sub, float16)
1377 DO_3OP(gvec_fsub_s, float32_sub, float32)
1378 DO_3OP(gvec_fsub_d, float64_sub, float64)
1379 
1380 DO_3OP(gvec_fmul_h, float16_mul, float16)
1381 DO_3OP(gvec_fmul_s, float32_mul, float32)
1382 DO_3OP(gvec_fmul_d, float64_mul, float64)
1383 
1384 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1385 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1386 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1387 
1388 DO_3OP(gvec_fabd_h, float16_abd, float16)
1389 DO_3OP(gvec_fabd_s, float32_abd, float32)
1390 DO_3OP(gvec_fabd_d, float64_abd, float64)
1391 
1392 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1393 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1394 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1395 
1396 DO_3OP(gvec_fcge_h, float16_cge, float16)
1397 DO_3OP(gvec_fcge_s, float32_cge, float32)
1398 DO_3OP(gvec_fcge_d, float64_cge, float64)
1399 
1400 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1401 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1402 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1403 
1404 DO_3OP(gvec_facge_h, float16_acge, float16)
1405 DO_3OP(gvec_facge_s, float32_acge, float32)
1406 DO_3OP(gvec_facge_d, float64_acge, float64)
1407 
1408 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1409 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1410 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1411 
1412 DO_3OP(gvec_fmax_h, float16_max, float16)
1413 DO_3OP(gvec_fmax_s, float32_max, float32)
1414 DO_3OP(gvec_fmax_d, float64_max, float64)
1415 
1416 DO_3OP(gvec_fmin_h, float16_min, float16)
1417 DO_3OP(gvec_fmin_s, float32_min, float32)
1418 DO_3OP(gvec_fmin_d, float64_min, float64)
1419 
1420 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1421 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1422 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1423 
1424 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1425 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1426 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1427 
1428 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1429 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1430 
1431 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1432 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1433 
1434 #ifdef TARGET_AARCH64
1435 DO_3OP(gvec_fdiv_h, float16_div, float16)
1436 DO_3OP(gvec_fdiv_s, float32_div, float32)
1437 DO_3OP(gvec_fdiv_d, float64_div, float64)
1438 
1439 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1440 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1441 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1442 
1443 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1444 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1445 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1446 
1447 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1448 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1449 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1450 
1451 DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
1452 DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
1453 DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)
1454 
1455 DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
1456 DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
1457 DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)
1458 
1459 #endif
1460 #undef DO_3OP
1461 
1462 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1463 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1464                                  float_status *stat)
1465 {
1466     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1467 }
1468 
1469 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1470                                  float_status *stat)
1471 {
1472     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1473 }
1474 
1475 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1476                                  float_status *stat)
1477 {
1478     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1479 }
1480 
1481 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1482                                  float_status *stat)
1483 {
1484     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1485 }
1486 
1487 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1488 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1489                                 float_status *stat)
1490 {
1491     return float16_muladd(op1, op2, dest, 0, stat);
1492 }
1493 
1494 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1495                                  float_status *stat)
1496 {
1497     return float32_muladd(op1, op2, dest, 0, stat);
1498 }
1499 
1500 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1501                                  float_status *stat)
1502 {
1503     return float64_muladd(op1, op2, dest, 0, stat);
1504 }
1505 
1506 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1507                                  float_status *stat)
1508 {
1509     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1510 }
1511 
1512 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1513                                  float_status *stat)
1514 {
1515     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1516 }
1517 
1518 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1519                                  float_status *stat)
1520 {
1521     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1522 }
1523 
1524 #define DO_MULADD(NAME, FUNC, TYPE)                                        \
1525 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1526                   float_status *stat, uint32_t desc)                       \
1527 {                                                                          \
1528     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1529     TYPE *d = vd, *n = vn, *m = vm;                                        \
1530     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1531         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1532     }                                                                      \
1533     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1534 }
1535 
1536 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1537 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1538 
1539 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1540 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1541 
1542 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1543 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1544 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1545 
1546 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1547 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1548 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1549 
1550 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1551  * For AdvSIMD, there is of course only one such vector segment.
1552  */
1553 
1554 #define DO_MUL_IDX(NAME, TYPE, H) \
1555 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1556 {                                                                          \
1557     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1558     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1559     intptr_t idx = simd_data(desc);                                        \
1560     TYPE *d = vd, *n = vn, *m = vm;                                        \
1561     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1562         TYPE mm = m[H(i + idx)];                                           \
1563         for (j = 0; j < segment; j++) {                                    \
1564             d[i + j] = n[i + j] * mm;                                      \
1565         }                                                                  \
1566     }                                                                      \
1567     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1568 }
1569 
1570 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1571 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1572 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1573 
1574 #undef DO_MUL_IDX
1575 
1576 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1577 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1578 {                                                                          \
1579     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1580     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1581     intptr_t idx = simd_data(desc);                                        \
1582     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1583     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1584         TYPE mm = m[H(i + idx)];                                           \
1585         for (j = 0; j < segment; j++) {                                    \
1586             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1587         }                                                                  \
1588     }                                                                      \
1589     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1590 }
1591 
1592 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1593 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1594 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1595 
1596 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1597 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1598 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1599 
1600 #undef DO_MLA_IDX
1601 
1602 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1603 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1604                   float_status *stat, uint32_t desc)                       \
1605 {                                                                          \
1606     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1607     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1608     intptr_t idx = simd_data(desc);                                        \
1609     TYPE *d = vd, *n = vn, *m = vm;                                        \
1610     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1611         TYPE mm = m[H(i + idx)];                                           \
1612         for (j = 0; j < segment; j++) {                                    \
1613             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1614         }                                                                  \
1615     }                                                                      \
1616     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1617 }
1618 
1619 #define nop(N, M, S) (M)
1620 
1621 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1622 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1623 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1624 
1625 #ifdef TARGET_AARCH64
1626 
1627 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1628 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1629 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1630 
1631 #endif
1632 
1633 #undef nop
1634 
1635 /*
1636  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1637  * the fused ops below, these accumulate both from and into Vd.
1638  */
1639 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1640 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1641 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1642 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1643 
1644 #undef DO_FMUL_IDX
1645 
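/*
 * Fused multiply-accumulate by indexed element.  The simd_data field of
 * desc packs a negate-op1 flag in bit 0 and the element index above it;
 * the flag is widened to a sign-bit mask and applied to op1 with XOR.
 */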
1646 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1647 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1648                   float_status *stat, uint32_t desc)                       \
1649 {                                                                          \
1650     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1651     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1652     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1653     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1654     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1655     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1656     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1657         TYPE mm = m[H(i + idx)];                                           \
1658         for (j = 0; j < segment; j++) {                                    \
1659             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1660                                      mm, a[i + j], 0, stat);               \
1661         }                                                                  \
1662     }                                                                      \
1663     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1664 }
1665 
1666 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1667 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1668 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1669 
1670 #undef DO_FMLA_IDX
1671 
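/*
 * Saturating add/subtract.  The operation is evaluated in the wider
 * type WTYPE and clamped to [MIN, MAX]; any clamping is recorded in
 * the saturation (QC) flag passed via vq.
 */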
1672 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1673 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1674 {                                                                          \
1675     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1676     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1677     bool q = false;                                                        \
1678     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1679         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1680         if (dd < MIN) {                                                    \
1681             dd = MIN;                                                      \
1682             q = true;                                                      \
1683         } else if (dd > MAX) {                                             \
1684             dd = MAX;                                                      \
1685             q = true;                                                      \
1686         }                                                                  \
1687         d[i] = dd;                                                         \
1688     }                                                                      \
1689     if (q) {                                                               \
1690         uint32_t *qc = vq;                                                 \
1691         qc[0] = 1;                                                         \
1692     }                                                                      \
1693     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1694 }
1695 
1696 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1697 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1698 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1699 
1700 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1701 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1702 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1703 
1704 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1705 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1706 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1707 
1708 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1709 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1710 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1711 
1712 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1713 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1714 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1715 
1716 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1717 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1718 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1719 
1720 #undef DO_SAT
1721 
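/*
 * The 64-bit element forms cannot use a wider intermediate type, so
 * saturation is detected directly from the operands and the wrapped
 * result.
 */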
1722 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1723                           void *vm, uint32_t desc)
1724 {
1725     intptr_t i, oprsz = simd_oprsz(desc);
1726     uint64_t *d = vd, *n = vn, *m = vm;
1727     bool q = false;
1728 
1729     for (i = 0; i < oprsz / 8; i++) {
1730         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1731         if (dd < nn) {
1732             dd = UINT64_MAX;
1733             q = true;
1734         }
1735         d[i] = dd;
1736     }
1737     if (q) {
1738         uint32_t *qc = vq;
1739         qc[0] = 1;
1740     }
1741     clear_tail(d, oprsz, simd_maxsz(desc));
1742 }
1743 
1744 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1745                           void *vm, uint32_t desc)
1746 {
1747     intptr_t i, oprsz = simd_oprsz(desc);
1748     uint64_t *d = vd, *n = vn, *m = vm;
1749     bool q = false;
1750 
1751     for (i = 0; i < oprsz / 8; i++) {
1752         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1753         if (nn < mm) {
1754             dd = 0;
1755             q = true;
1756         }
1757         d[i] = dd;
1758     }
1759     if (q) {
1760         uint32_t *qc = vq;
1761         qc[0] = 1;
1762     }
1763     clear_tail(d, oprsz, simd_maxsz(desc));
1764 }
1765 
1766 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1767                           void *vm, uint32_t desc)
1768 {
1769     intptr_t i, oprsz = simd_oprsz(desc);
1770     int64_t *d = vd, *n = vn, *m = vm;
1771     bool q = false;
1772 
1773     for (i = 0; i < oprsz / 8; i++) {
1774         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1775         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1776             dd = (nn >> 63) ^ ~INT64_MIN;
1777             q = true;
1778         }
1779         d[i] = dd;
1780     }
1781     if (q) {
1782         uint32_t *qc = vq;
1783         qc[0] = 1;
1784     }
1785     clear_tail(d, oprsz, simd_maxsz(desc));
1786 }
1787 
1788 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1789                           void *vm, uint32_t desc)
1790 {
1791     intptr_t i, oprsz = simd_oprsz(desc);
1792     int64_t *d = vd, *n = vn, *m = vm;
1793     bool q = false;
1794 
1795     for (i = 0; i < oprsz / 8; i++) {
1796         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1797         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1798             dd = (nn >> 63) ^ ~INT64_MIN;
1799             q = true;
1800         }
1801         d[i] = dd;
1802     }
1803     if (q) {
1804         uint32_t *qc = vq;
1805         qc[0] = 1;
1806     }
1807     clear_tail(d, oprsz, simd_maxsz(desc));
1808 }
1809 
1810 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1811                            void *vm, uint32_t desc)
1812 {
1813     intptr_t i, oprsz = simd_oprsz(desc);
1814     uint64_t *d = vd, *n = vn, *m = vm;
1815     bool q = false;
1816 
1817     for (i = 0; i < oprsz / 8; i++) {
1818         uint64_t nn = n[i];
1819         int64_t mm = m[i];
1820         uint64_t dd = nn + mm;
1821 
1822         if (mm < 0) {
1823             if (nn < (uint64_t)-mm) {
1824                 dd = 0;
1825                 q = true;
1826             }
1827         } else {
1828             if (dd < nn) {
1829                 dd = UINT64_MAX;
1830                 q = true;
1831             }
1832         }
1833         d[i] = dd;
1834     }
1835     if (q) {
1836         uint32_t *qc = vq;
1837         qc[0] = 1;
1838     }
1839     clear_tail(d, oprsz, simd_maxsz(desc));
1840 }
1841 
1842 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1843                            void *vm, uint32_t desc)
1844 {
1845     intptr_t i, oprsz = simd_oprsz(desc);
1846     uint64_t *d = vd, *n = vn, *m = vm;
1847     bool q = false;
1848 
1849     for (i = 0; i < oprsz / 8; i++) {
1850         int64_t nn = n[i];
1851         uint64_t mm = m[i];
1852         int64_t dd = nn + mm;
1853 
1854         if (mm > (uint64_t)(INT64_MAX - nn)) {
1855             dd = INT64_MAX;
1856             q = true;
1857         }
1858         d[i] = dd;
1859     }
1860     if (q) {
1861         uint32_t *qc = vq;
1862         qc[0] = 1;
1863     }
1864     clear_tail(d, oprsz, simd_maxsz(desc));
1865 }
1866 
1867 #define DO_SRA(NAME, TYPE)                              \
1868 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1869 {                                                       \
1870     intptr_t i, oprsz = simd_oprsz(desc);               \
1871     int shift = simd_data(desc);                        \
1872     TYPE *d = vd, *n = vn;                              \
1873     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1874         d[i] += n[i] >> shift;                          \
1875     }                                                   \
1876     clear_tail(d, oprsz, simd_maxsz(desc));             \
1877 }
1878 
1879 DO_SRA(gvec_ssra_b, int8_t)
1880 DO_SRA(gvec_ssra_h, int16_t)
1881 DO_SRA(gvec_ssra_s, int32_t)
1882 DO_SRA(gvec_ssra_d, int64_t)
1883 
1884 DO_SRA(gvec_usra_b, uint8_t)
1885 DO_SRA(gvec_usra_h, uint16_t)
1886 DO_SRA(gvec_usra_s, uint32_t)
1887 DO_SRA(gvec_usra_d, uint64_t)
1888 
1889 #undef DO_SRA
1890 
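/*
 * Rounding shift right: shift by (shift - 1) and add back the low bit
 * of the intermediate.  This is equivalent to adding 1 << (shift - 1)
 * before the full shift, but cannot overflow the intermediate value.
 */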
1891 #define DO_RSHR(NAME, TYPE)                             \
1892 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1893 {                                                       \
1894     intptr_t i, oprsz = simd_oprsz(desc);               \
1895     int shift = simd_data(desc);                        \
1896     TYPE *d = vd, *n = vn;                              \
1897     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1898         TYPE tmp = n[i] >> (shift - 1);                 \
1899         d[i] = (tmp >> 1) + (tmp & 1);                  \
1900     }                                                   \
1901     clear_tail(d, oprsz, simd_maxsz(desc));             \
1902 }
1903 
1904 DO_RSHR(gvec_srshr_b, int8_t)
1905 DO_RSHR(gvec_srshr_h, int16_t)
1906 DO_RSHR(gvec_srshr_s, int32_t)
1907 DO_RSHR(gvec_srshr_d, int64_t)
1908 
1909 DO_RSHR(gvec_urshr_b, uint8_t)
1910 DO_RSHR(gvec_urshr_h, uint16_t)
1911 DO_RSHR(gvec_urshr_s, uint32_t)
1912 DO_RSHR(gvec_urshr_d, uint64_t)
1913 
1914 #undef DO_RSHR
1915 
1916 #define DO_RSRA(NAME, TYPE)                             \
1917 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1918 {                                                       \
1919     intptr_t i, oprsz = simd_oprsz(desc);               \
1920     int shift = simd_data(desc);                        \
1921     TYPE *d = vd, *n = vn;                              \
1922     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1923         TYPE tmp = n[i] >> (shift - 1);                 \
1924         d[i] += (tmp >> 1) + (tmp & 1);                 \
1925     }                                                   \
1926     clear_tail(d, oprsz, simd_maxsz(desc));             \
1927 }
1928 
1929 DO_RSRA(gvec_srsra_b, int8_t)
1930 DO_RSRA(gvec_srsra_h, int16_t)
1931 DO_RSRA(gvec_srsra_s, int32_t)
1932 DO_RSRA(gvec_srsra_d, int64_t)
1933 
1934 DO_RSRA(gvec_ursra_b, uint8_t)
1935 DO_RSRA(gvec_ursra_h, uint16_t)
1936 DO_RSRA(gvec_ursra_s, uint32_t)
1937 DO_RSRA(gvec_ursra_d, uint64_t)
1938 
1939 #undef DO_RSRA
1940 
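/*
 * Shift right and insert: the shifted value replaces the low
 * (esize - shift) bits of the destination, leaving the top 'shift'
 * bits of the existing value intact.  DO_SLI below is the mirror
 * image, preserving the low 'shift' bits.
 */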
1941 #define DO_SRI(NAME, TYPE)                              \
1942 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1943 {                                                       \
1944     intptr_t i, oprsz = simd_oprsz(desc);               \
1945     int shift = simd_data(desc);                        \
1946     TYPE *d = vd, *n = vn;                              \
1947     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1948         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1949     }                                                   \
1950     clear_tail(d, oprsz, simd_maxsz(desc));             \
1951 }
1952 
1953 DO_SRI(gvec_sri_b, uint8_t)
1954 DO_SRI(gvec_sri_h, uint16_t)
1955 DO_SRI(gvec_sri_s, uint32_t)
1956 DO_SRI(gvec_sri_d, uint64_t)
1957 
1958 #undef DO_SRI
1959 
1960 #define DO_SLI(NAME, TYPE)                              \
1961 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1962 {                                                       \
1963     intptr_t i, oprsz = simd_oprsz(desc);               \
1964     int shift = simd_data(desc);                        \
1965     TYPE *d = vd, *n = vn;                              \
1966     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1967         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1968     }                                                   \
1969     clear_tail(d, oprsz, simd_maxsz(desc));             \
1970 }
1971 
1972 DO_SLI(gvec_sli_b, uint8_t)
1973 DO_SLI(gvec_sli_h, uint16_t)
1974 DO_SLI(gvec_sli_s, uint32_t)
1975 DO_SLI(gvec_sli_d, uint64_t)
1976 
1977 #undef DO_SLI
1978 
1979 /*
1980  * Convert float16 to float32, raising no exceptions and
1981  * preserving exceptional values, including SNaN.
1982  * This is effectively an unpack+repack operation.
1983  */
1984 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1985 {
1986     const int f16_bias = 15;
1987     const int f32_bias = 127;
1988     uint32_t sign = extract32(f16, 15, 1);
1989     uint32_t exp = extract32(f16, 10, 5);
1990     uint32_t frac = extract32(f16, 0, 10);
1991 
1992     if (exp == 0x1f) {
1993         /* Inf or NaN */
1994         exp = 0xff;
1995     } else if (exp == 0) {
1996         /* Zero or denormal.  */
1997         if (frac != 0) {
1998             if (fz16) {
1999                 frac = 0;
2000             } else {
2001                 /*
2002                  * Denormal; these are all normal float32.
2003                  * Shift the fraction so that the msb is at bit 11,
2004                  * then remove bit 11 as the implicit bit of the
2005                  * normalized float32.  Note that we still go through
2006                  * the shift for normal numbers below, to put the
2007                  * float32 fraction at the right place.
2008                  */
2009                 int shift = clz32(frac) - 21;
2010                 frac = (frac << shift) & 0x3ff;
2011                 exp = f32_bias - f16_bias - shift + 1;
2012             }
2013         }
2014     } else {
2015         /* Normal number; adjust the bias.  */
2016         exp += f32_bias - f16_bias;
2017     }
2018     sign <<= 31;
2019     exp <<= 23;
2020     frac <<= 23 - 10;
2021 
2022     return sign | exp | frac;
2023 }
2024 
2025 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2026 {
2027     /*
2028      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2029      * Load the 2nd qword iff is_q & is_2.
2030      * Shift to the 2nd dword iff !is_q & is_2.
2031      * For !is_q & !is_2, the upper bits of the result are garbage.
2032      */
2033     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2034 }
2035 
2036 /*
2037  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2038  * as there are not yet SVE versions that might use blocking.
2039  */
2040 
2041 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2042                      uint32_t desc, bool fz16)
2043 {
2044     intptr_t i, oprsz = simd_oprsz(desc);
2045     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2046     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2047     int is_q = oprsz == 16;
2048     uint64_t n_4, m_4;
2049 
2050     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2051     n_4 = load4_f16(vn, is_q, is_2);
2052     m_4 = load4_f16(vm, is_q, is_2);
2053 
2054     /* Negate all inputs for FMLSL at once.  */
2055     if (is_s) {
2056         n_4 ^= 0x8000800080008000ull;
2057     }
2058 
2059     for (i = 0; i < oprsz / 4; i++) {
2060         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2061         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2062         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2063     }
2064     clear_tail(d, oprsz, simd_maxsz(desc));
2065 }
2066 
2067 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2068                             CPUARMState *env, uint32_t desc)
2069 {
2070     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2071              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2072 }
2073 
2074 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2075                             CPUARMState *env, uint32_t desc)
2076 {
2077     do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, desc,
2078              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2079 }
2080 
2081 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2082                                CPUARMState *env, uint32_t desc)
2083 {
2084     intptr_t i, oprsz = simd_oprsz(desc);
2085     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2086     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2087     float_status *status = &env->vfp.fp_status_a64;
2088     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2089 
2090     for (i = 0; i < oprsz; i += sizeof(float32)) {
2091         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2092         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2093         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2094         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2095         float32 aa = *(float32 *)(va + H1_4(i));
2096 
2097         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2098     }
2099 }
2100 
2101 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2102                          uint32_t desc, bool fz16)
2103 {
2104     intptr_t i, oprsz = simd_oprsz(desc);
2105     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2106     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2107     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2108     int is_q = oprsz == 16;
2109     uint64_t n_4;
2110     float32 m_1;
2111 
2112     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2113     n_4 = load4_f16(vn, is_q, is_2);
2114 
2115     /* Negate all inputs for FMLSL at once.  */
2116     if (is_s) {
2117         n_4 ^= 0x8000800080008000ull;
2118     }
2119 
2120     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2121 
2122     for (i = 0; i < oprsz / 4; i++) {
2123         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2124         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2125     }
2126     clear_tail(d, oprsz, simd_maxsz(desc));
2127 }
2128 
2129 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2130                                 CPUARMState *env, uint32_t desc)
2131 {
2132     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2133                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2134 }
2135 
2136 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2137                                 CPUARMState *env, uint32_t desc)
2138 {
2139     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, desc,
2140                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2141 }
2142 
2143 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2144                                CPUARMState *env, uint32_t desc)
2145 {
2146     intptr_t i, j, oprsz = simd_oprsz(desc);
2147     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2148     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2149     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2150     float_status *status = &env->vfp.fp_status_a64;
2151     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2152 
2153     for (i = 0; i < oprsz; i += 16) {
2154         float16 mm_16 = *(float16 *)(vm + i + idx);
2155         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2156 
2157         for (j = 0; j < 16; j += sizeof(float32)) {
2158             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2159             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2160             float32 aa = *(float32 *)(va + H1_4(i + j));
2161 
2162             *(float32 *)(vd + H1_4(i + j)) =
2163                 float32_muladd(nn, mm, aa, 0, status);
2164         }
2165     }
2166 }
2167 
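/*
 * Element-wise shift by a signed, per-element shift count: non-negative
 * counts shift left, negative counts shift right by the absolute value.
 * Left shifts of esize or more give zero; out-of-range right shifts give
 * zero for the unsigned forms and replicate the sign bit for the signed
 * forms.
 */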
2168 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2169 {
2170     intptr_t i, opr_sz = simd_oprsz(desc);
2171     int8_t *d = vd, *n = vn, *m = vm;
2172 
2173     for (i = 0; i < opr_sz; ++i) {
2174         int8_t mm = m[i];
2175         int8_t nn = n[i];
2176         int8_t res = 0;
2177         if (mm >= 0) {
2178             if (mm < 8) {
2179                 res = nn << mm;
2180             }
2181         } else {
2182             res = nn >> (mm > -8 ? -mm : 7);
2183         }
2184         d[i] = res;
2185     }
2186     clear_tail(d, opr_sz, simd_maxsz(desc));
2187 }
2188 
2189 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2190 {
2191     intptr_t i, opr_sz = simd_oprsz(desc);
2192     int16_t *d = vd, *n = vn, *m = vm;
2193 
2194     for (i = 0; i < opr_sz / 2; ++i) {
2195         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2196         int16_t nn = n[i];
2197         int16_t res = 0;
2198         if (mm >= 0) {
2199             if (mm < 16) {
2200                 res = nn << mm;
2201             }
2202         } else {
2203             res = nn >> (mm > -16 ? -mm : 15);
2204         }
2205         d[i] = res;
2206     }
2207     clear_tail(d, opr_sz, simd_maxsz(desc));
2208 }
2209 
2210 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2211 {
2212     intptr_t i, opr_sz = simd_oprsz(desc);
2213     uint8_t *d = vd, *n = vn, *m = vm;
2214 
2215     for (i = 0; i < opr_sz; ++i) {
2216         int8_t mm = m[i];
2217         uint8_t nn = n[i];
2218         uint8_t res = 0;
2219         if (mm >= 0) {
2220             if (mm < 8) {
2221                 res = nn << mm;
2222             }
2223         } else {
2224             if (mm > -8) {
2225                 res = nn >> -mm;
2226             }
2227         }
2228         d[i] = res;
2229     }
2230     clear_tail(d, opr_sz, simd_maxsz(desc));
2231 }
2232 
2233 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2234 {
2235     intptr_t i, opr_sz = simd_oprsz(desc);
2236     uint16_t *d = vd, *n = vn, *m = vm;
2237 
2238     for (i = 0; i < opr_sz / 2; ++i) {
2239         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2240         uint16_t nn = n[i];
2241         uint16_t res = 0;
2242         if (mm >= 0) {
2243             if (mm < 16) {
2244                 res = nn << mm;
2245             }
2246         } else {
2247             if (mm > -16) {
2248                 res = nn >> -mm;
2249             }
2250         }
2251         d[i] = res;
2252     }
2253     clear_tail(d, opr_sz, simd_maxsz(desc));
2254 }
2255 
2256 /*
2257  * 8x8->8 polynomial multiply.
2258  *
2259  * Polynomial multiplication is like integer multiplication except the
2260  * partial products are XORed, not added.
2261  *
2262  * TODO: expose this as a generic vector operation, as it is a common
2263  * crypto building block.
2264  */
2265 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2266 {
2267     intptr_t i, opr_sz = simd_oprsz(desc);
2268     uint64_t *d = vd, *n = vn, *m = vm;
2269 
2270     for (i = 0; i < opr_sz / 8; ++i) {
2271         d[i] = clmul_8x8_low(n[i], m[i]);
2272     }
2273     clear_tail(d, opr_sz, simd_maxsz(desc));
2274 }
2275 
2276 /*
2277  * 64x64->128 polynomial multiply.
2278  * Because the lanes are not accessed in strict columns,
2279  * this probably cannot be turned into a generic helper.
2280  */
2281 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2282 {
2283     intptr_t i, opr_sz = simd_oprsz(desc);
2284     intptr_t hi = simd_data(desc);
2285     uint64_t *d = vd, *n = vn, *m = vm;
2286 
2287     for (i = 0; i < opr_sz / 8; i += 2) {
2288         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2289         d[i] = int128_getlo(r);
2290         d[i + 1] = int128_gethi(r);
2291     }
2292     clear_tail(d, opr_sz, simd_maxsz(desc));
2293 }
2294 
2295 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2296 {
2297     int hi = simd_data(desc);
2298     uint64_t *d = vd, *n = vn, *m = vm;
2299     uint64_t nn = n[hi], mm = m[hi];
2300 
2301     d[0] = clmul_8x4_packed(nn, mm);
2302     nn >>= 32;
2303     mm >>= 32;
2304     d[1] = clmul_8x4_packed(nn, mm);
2305 
2306     clear_tail(d, 16, simd_maxsz(desc));
2307 }
2308 
2309 #ifdef TARGET_AARCH64
2310 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2311 {
2312     int shift = simd_data(desc) * 8;
2313     intptr_t i, opr_sz = simd_oprsz(desc);
2314     uint64_t *d = vd, *n = vn, *m = vm;
2315 
2316     for (i = 0; i < opr_sz / 8; ++i) {
2317         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2318     }
2319 }
2320 
2321 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2322 {
2323     intptr_t sel = H4(simd_data(desc));
2324     intptr_t i, opr_sz = simd_oprsz(desc);
2325     uint32_t *n = vn, *m = vm;
2326     uint64_t *d = vd;
2327 
2328     for (i = 0; i < opr_sz / 8; ++i) {
2329         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2330     }
2331 }
2332 #endif
2333 
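/*
 * Compare each element against zero, producing an all-ones mask when
 * the condition holds and zero otherwise (hence negating the boolean
 * result of the comparison).
 */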
2334 #define DO_CMP0(NAME, TYPE, OP)                         \
2335 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2336 {                                                       \
2337     intptr_t i, opr_sz = simd_oprsz(desc);              \
2338     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2339         TYPE nn = *(TYPE *)(vn + i);                    \
2340         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2341     }                                                   \
2342     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2343 }
2344 
2345 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2346 DO_CMP0(gvec_clt0_b, int8_t, <)
2347 DO_CMP0(gvec_cle0_b, int8_t, <=)
2348 DO_CMP0(gvec_cgt0_b, int8_t, >)
2349 DO_CMP0(gvec_cge0_b, int8_t, >=)
2350 
2351 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2352 DO_CMP0(gvec_clt0_h, int16_t, <)
2353 DO_CMP0(gvec_cle0_h, int16_t, <=)
2354 DO_CMP0(gvec_cgt0_h, int16_t, >)
2355 DO_CMP0(gvec_cge0_h, int16_t, >=)
2356 
2357 #undef DO_CMP0
2358 
2359 #define DO_ABD(NAME, TYPE)                                      \
2360 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2361 {                                                               \
2362     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2363     TYPE *d = vd, *n = vn, *m = vm;                             \
2364                                                                 \
2365     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2366         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2367     }                                                           \
2368     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2369 }
2370 
2371 DO_ABD(gvec_sabd_b, int8_t)
2372 DO_ABD(gvec_sabd_h, int16_t)
2373 DO_ABD(gvec_sabd_s, int32_t)
2374 DO_ABD(gvec_sabd_d, int64_t)
2375 
2376 DO_ABD(gvec_uabd_b, uint8_t)
2377 DO_ABD(gvec_uabd_h, uint16_t)
2378 DO_ABD(gvec_uabd_s, uint32_t)
2379 DO_ABD(gvec_uabd_d, uint64_t)
2380 
2381 #undef DO_ABD
2382 
2383 #define DO_ABA(NAME, TYPE)                                      \
2384 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2385 {                                                               \
2386     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2387     TYPE *d = vd, *n = vn, *m = vm;                             \
2388                                                                 \
2389     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2390         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2391     }                                                           \
2392     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2393 }
2394 
2395 DO_ABA(gvec_saba_b, int8_t)
2396 DO_ABA(gvec_saba_h, int16_t)
2397 DO_ABA(gvec_saba_s, int32_t)
2398 DO_ABA(gvec_saba_d, int64_t)
2399 
2400 DO_ABA(gvec_uaba_b, uint8_t)
2401 DO_ABA(gvec_uaba_h, uint16_t)
2402 DO_ABA(gvec_uaba_s, uint32_t)
2403 DO_ABA(gvec_uaba_d, uint64_t)
2404 
2405 #undef DO_ABA
2406 
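/*
 * Pairwise operations: the low half of the result is computed from
 * adjacent pairs of Vn, the high half from adjacent pairs of Vm.
 * Only the d == m overlap needs a scratch copy: at step i the Vn pairs
 * read are at indices 2i and 2i+1, which have not yet been overwritten.
 */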
2407 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2408 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
2409                   float_status *stat, uint32_t desc)                       \
2410 {                                                                          \
2411     ARMVectorReg scratch;                                                  \
2412     intptr_t oprsz = simd_oprsz(desc);                                     \
2413     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2414     TYPE *d = vd, *n = vn, *m = vm;                                        \
2415     if (unlikely(d == m)) {                                                \
2416         m = memcpy(&scratch, m, oprsz);                                    \
2417     }                                                                      \
2418     for (intptr_t i = 0; i < half; ++i) {                                  \
2419         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2420     }                                                                      \
2421     for (intptr_t i = 0; i < half; ++i) {                                  \
2422         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2423     }                                                                      \
2424     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2425 }
2426 
2427 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2428 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2429 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2430 
2431 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2432 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2433 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2434 
2435 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2436 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2437 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2438 
2439 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2440 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2441 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2442 
2443 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2444 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2445 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2446 
2447 #ifdef TARGET_AARCH64
2448 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
2449 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
2450 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, )
2451 
2452 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
2453 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
2454 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, )
2455 #endif
2456 
2457 #undef DO_3OP_PAIR
2458 
2459 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2460 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2461 {                                                               \
2462     ARMVectorReg scratch;                                       \
2463     intptr_t oprsz = simd_oprsz(desc);                          \
2464     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2465     TYPE *d = vd, *n = vn, *m = vm;                             \
2466     if (unlikely(d == m)) {                                     \
2467         m = memcpy(&scratch, m, oprsz);                         \
2468     }                                                           \
2469     for (intptr_t i = 0; i < half; ++i) {                       \
2470         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2471     }                                                           \
2472     for (intptr_t i = 0; i < half; ++i) {                       \
2473         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2474     }                                                           \
2475     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2476 }
2477 
2478 #define ADD(A, B) (A + B)
2479 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2480 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2481 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2482 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2483 #undef  ADD
2484 
2485 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2486 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2487 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2488 
2489 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2490 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2491 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2492 
2493 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2494 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2495 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2496 
2497 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2498 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2499 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2500 
2501 #undef DO_3OP_PAIR
2502 
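/*
 * Conversions between fixed-point and floating-point.  simd_data(desc)
 * holds the fixed-point shift (number of fraction bits), passed through
 * to the scalar VFP conversion helper.
 */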
2503 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2504     void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
2505     {                                                                   \
2506         intptr_t i, oprsz = simd_oprsz(desc);                           \
2507         int shift = simd_data(desc);                                    \
2508         TYPE *d = vd, *n = vn;                                          \
2509         float_status *fpst = stat;                                      \
2510         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2511             d[i] = FUNC(n[i], shift, fpst);                             \
2512         }                                                               \
2513         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2514     }
2515 
2516 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2517 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2518 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2519 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2520 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2521 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2522 
2523 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2524 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2525 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2526 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2527 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2528 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2529 
2530 #undef DO_VCVT_FIXED
2531 
2532 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2533     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2534     {                                                                   \
2535         intptr_t i, oprsz = simd_oprsz(desc);                           \
2536         uint32_t rmode = simd_data(desc);                               \
2537         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2538         TYPE *d = vd, *n = vn;                                          \
2539         set_float_rounding_mode(rmode, fpst);                           \
2540         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2541             d[i] = FUNC(n[i], 0, fpst);                                 \
2542         }                                                               \
2543         set_float_rounding_mode(prev_rmode, fpst);                      \
2544         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2545     }
2546 
2547 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
2548 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
2549 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2550 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2551 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2552 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2553 
2554 #undef DO_VCVT_RMODE
2555 
2556 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2557     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2558     {                                                                   \
2559         intptr_t i, oprsz = simd_oprsz(desc);                           \
2560         uint32_t rmode = simd_data(desc);                               \
2561         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2562         TYPE *d = vd, *n = vn;                                          \
2563         set_float_rounding_mode(rmode, fpst);                           \
2564         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2565             d[i] = FUNC(n[i], fpst);                                    \
2566         }                                                               \
2567         set_float_rounding_mode(prev_rmode, fpst);                      \
2568         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2569     }
2570 
2571 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2572 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2573 
2574 #undef DO_VRINT_RMODE
2575 
2576 #ifdef TARGET_AARCH64
2577 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
2578 {
2579     const uint8_t *indices = vm;
2580     size_t oprsz = simd_oprsz(desc);
2581     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2582     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2583     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2584     union {
2585         uint8_t b[16];
2586         uint64_t d[2];
2587     } result;
2588 
2589     /*
2590      * We must construct the final result in a temp, lest the output
2591      * overlap the input table.  For TBL, begin with zero; for TBX,
2592      * begin with the original register contents.  Note that we always
2593      * copy 16 bytes here to avoid an extra branch; clearing the high
2594      * bits of the register for oprsz == 8 is handled below.
2595      */
2596     if (is_tbx) {
2597         memcpy(&result, vd, 16);
2598     } else {
2599         memset(&result, 0, 16);
2600     }
2601 
2602     for (size_t i = 0; i < oprsz; ++i) {
2603         uint32_t index = indices[H1(i)];
2604 
2605         if (index < table_len) {
2606             /*
2607              * Convert index (a byte offset into the virtual table
2608              * which is a series of 128-bit vectors concatenated)
2609              * into the correct register element, bearing in mind
2610              * that the table can wrap around from V31 to V0.
2611              */
2612             const uint8_t *table = (const uint8_t *)
2613                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2614             result.b[H1(i)] = table[H1(index % 16)];
2615         }
2616     }
2617 
2618     memcpy(vd, &result, 16);
2619     clear_tail(vd, oprsz, simd_maxsz(desc));
2620 }
2621 #endif
2622 
2623 /*
2624  * NxN -> N highpart multiply
2625  *
2626  * TODO: expose this as a generic vector operation.
2627  */
2628 
2629 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2630 {
2631     intptr_t i, opr_sz = simd_oprsz(desc);
2632     int8_t *d = vd, *n = vn, *m = vm;
2633 
2634     for (i = 0; i < opr_sz; ++i) {
2635         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2636     }
2637     clear_tail(d, opr_sz, simd_maxsz(desc));
2638 }
2639 
2640 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2641 {
2642     intptr_t i, opr_sz = simd_oprsz(desc);
2643     int16_t *d = vd, *n = vn, *m = vm;
2644 
2645     for (i = 0; i < opr_sz / 2; ++i) {
2646         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2647     }
2648     clear_tail(d, opr_sz, simd_maxsz(desc));
2649 }
2650 
2651 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2652 {
2653     intptr_t i, opr_sz = simd_oprsz(desc);
2654     int32_t *d = vd, *n = vn, *m = vm;
2655 
2656     for (i = 0; i < opr_sz / 4; ++i) {
2657         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2658     }
2659     clear_tail(d, opr_sz, simd_maxsz(desc));
2660 }
2661 
2662 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2663 {
2664     intptr_t i, opr_sz = simd_oprsz(desc);
2665     uint64_t *d = vd, *n = vn, *m = vm;
2666     uint64_t discard;
2667 
2668     for (i = 0; i < opr_sz / 8; ++i) {
2669         muls64(&discard, &d[i], n[i], m[i]);
2670     }
2671     clear_tail(d, opr_sz, simd_maxsz(desc));
2672 }
2673 
2674 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2675 {
2676     intptr_t i, opr_sz = simd_oprsz(desc);
2677     uint8_t *d = vd, *n = vn, *m = vm;
2678 
2679     for (i = 0; i < opr_sz; ++i) {
2680         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2681     }
2682     clear_tail(d, opr_sz, simd_maxsz(desc));
2683 }
2684 
2685 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2686 {
2687     intptr_t i, opr_sz = simd_oprsz(desc);
2688     uint16_t *d = vd, *n = vn, *m = vm;
2689 
2690     for (i = 0; i < opr_sz / 2; ++i) {
2691         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2692     }
2693     clear_tail(d, opr_sz, simd_maxsz(desc));
2694 }
2695 
2696 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2697 {
2698     intptr_t i, opr_sz = simd_oprsz(desc);
2699     uint32_t *d = vd, *n = vn, *m = vm;
2700 
2701     for (i = 0; i < opr_sz / 4; ++i) {
2702         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2703     }
2704     clear_tail(d, opr_sz, simd_maxsz(desc));
2705 }
2706 
2707 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2708 {
2709     intptr_t i, opr_sz = simd_oprsz(desc);
2710     uint64_t *d = vd, *n = vn, *m = vm;
2711     uint64_t discard;
2712 
2713     for (i = 0; i < opr_sz / 8; ++i) {
2714         mulu64(&discard, &d[i], n[i], m[i]);
2715     }
2716     clear_tail(d, opr_sz, simd_maxsz(desc));
2717 }
2718 
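/*
 * XAR: exclusive-or the two inputs, then rotate each 64-bit lane
 * right by the immediate shift amount.
 */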
2719 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2720 {
2721     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2722     int shr = simd_data(desc);
2723     uint64_t *d = vd, *n = vn, *m = vm;
2724 
2725     for (i = 0; i < opr_sz; ++i) {
2726         d[i] = ror64(n[i] ^ m[i], shr);
2727     }
2728     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2729 }
2730 
2731 /*
2732  * Integer matrix-multiply accumulate
2733  */
2734 
2735 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2736 {
2737     int8_t *n = vn, *m = vm;
2738 
2739     for (intptr_t k = 0; k < 8; ++k) {
2740         sum += n[H1(k)] * m[H1(k)];
2741     }
2742     return sum;
2743 }
2744 
2745 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2746 {
2747     uint8_t *n = vn, *m = vm;
2748 
2749     for (intptr_t k = 0; k < 8; ++k) {
2750         sum += n[H1(k)] * m[H1(k)];
2751     }
2752     return sum;
2753 }
2754 
2755 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2756 {
2757     uint8_t *n = vn;
2758     int8_t *m = vm;
2759 
2760     for (intptr_t k = 0; k < 8; ++k) {
2761         sum += n[H1(k)] * m[H1(k)];
2762     }
2763     return sum;
2764 }
2765 
2766 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2767                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2768 {
2769     intptr_t seg, opr_sz = simd_oprsz(desc);
2770 
2771     for (seg = 0; seg < opr_sz; seg += 16) {
2772         uint32_t *d = vd + seg;
2773         uint32_t *a = va + seg;
2774         uint32_t sum0, sum1, sum2, sum3;
2775 
2776         /*
2777          * Process the entire segment at once, writing back the
2778          * results only after we've consumed all of the inputs.
2779          *
2780          * Key to indices by column:
2781          *          i   j                  i             j
2782          */
2783         sum0 = a[H4(0 + 0)];
2784         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2785         sum1 = a[H4(0 + 1)];
2786         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2787         sum2 = a[H4(2 + 0)];
2788         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2789         sum3 = a[H4(2 + 1)];
2790         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2791 
2792         d[H4(0)] = sum0;
2793         d[H4(1)] = sum1;
2794         d[H4(2)] = sum2;
2795         d[H4(3)] = sum3;
2796     }
2797     clear_tail(vd, opr_sz, simd_maxsz(desc));
2798 }
2799 
2800 #define DO_MMLA_B(NAME, INNER) \
2801     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2802     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2803 
2804 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2805 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2806 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2807 
2808 /*
2809  * BFloat16 Dot Product
2810  */
2811 
2812 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2813 {
2814     /*
2815      * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2816      * For EBF = 0, we ignore the FPCR bits which determine rounding
2817      * mode and denormal-flushing, and we do unfused multiplies and
2818      * additions with intermediate rounding of all products and sums.
2819      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2820      * and we perform a fused two-way sum-of-products without intermediate
2821      * rounding of the products.
2822      * In either case, we don't set fp exception flags.
2823      *
2824      * EBF is AArch64 only, so even if it's set in the FPCR it has
2825      * no effect on AArch32 instructions.
2826      */
2827     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2828 
2829     *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32;
2830     set_default_nan_mode(true, statusp);
2831 
2832     if (ebf) {
2833         /* EBF=1 needs to do a step with round-to-odd semantics */
2834         *oddstatusp = *statusp;
2835         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2836     } else {
2837         set_flush_to_zero(true, statusp);
2838         set_flush_inputs_to_zero(true, statusp);
2839         set_float_rounding_mode(float_round_to_odd_inf, statusp);
2840     }
2841     return ebf;
2842 }
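
/*
 * Typical caller pattern (a usage sketch only; the real loops are in
 * the helpers below):
 *
 *     float_status fpst, fpst_odd;
 *
 *     if (is_ebf(env, &fpst, &fpst_odd)) {
 *         d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
 *     } else {
 *         d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
 *     }
 */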
2843 
2844 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2845 {
2846     float32 t1, t2;
2847 
2848     /*
2849      * Extract each BFloat16 from the element pair, and shift
2850      * them such that they become float32.
2851      */
2852     t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2853     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2854     t1 = float32_add(t1, t2, fpst);
2855     t1 = float32_add(sum, t1, fpst);
2856 
2857     return t1;
2858 }
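
/*
 * Note on the shifts above: a bfloat16 is simply the high 16 bits of
 * the corresponding float32, so "e1 << 16" and "e1 & 0xffff0000" turn
 * the two halves of the element pair into float32 values exactly,
 * with no rounding.  The two multiplies and two adds are then each
 * individually rounded, which is the unfused EBF=0 behaviour
 * described in is_ebf() above.
 */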
2859 
2860 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
2861                      float_status *fpst, float_status *fpst_odd)
2862 {
2863     /*
2864      * Compare f16_dotadd() in sme_helper.c, but here we have
2865      * bfloat16 inputs. In particular that means that we do not
2866      * want the FPCR.FZ16 flush semantics, so we use the normal
2867      * float_status for the input handling here.
2868      */
2869     float64 e1r = float32_to_float64(e1 << 16, fpst);
2870     float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
2871     float64 e2r = float32_to_float64(e2 << 16, fpst);
2872     float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
2873     float64 t64;
2874     float32 t32;
2875 
2876     /*
2877      * The ARM pseudocode function FPDot performs both multiplies
2878      * and the add with a single rounding operation.  Emulate this
2879      * by performing the first multiply in round-to-odd, then doing
2880      * the second multiply as fused multiply-add, and rounding to
2881      * float32 all in one step.
2882      */
2883     t64 = float64_mul(e1r, e2r, fpst_odd);
2884     t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
2885 
2886     /* This conversion is exact, because we've already rounded. */
2887     t32 = float64_to_float32(t64, fpst);
2888 
2889     /* The final accumulation step is not fused. */
2890     return float32_add(sum, t32, fpst);
2891 }
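
/*
 * Why round-to-odd works here: the first product is kept in float64,
 * and rounding it to odd records any inexactness in the low bit, so
 * the final narrowing to float32 inside float64r32_muladd() cannot
 * suffer a double-rounding error.  The overall effect matches the
 * single rounding that the FPDot pseudocode requires.
 */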
2892 
2893 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
2894                         CPUARMState *env, uint32_t desc)
2895 {
2896     intptr_t i, opr_sz = simd_oprsz(desc);
2897     float32 *d = vd, *a = va;
2898     uint32_t *n = vn, *m = vm;
2899     float_status fpst, fpst_odd;
2900 
2901     if (is_ebf(env, &fpst, &fpst_odd)) {
2902         for (i = 0; i < opr_sz / 4; ++i) {
2903             d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
2904         }
2905     } else {
2906         for (i = 0; i < opr_sz / 4; ++i) {
2907             d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
2908         }
2909     }
2910     clear_tail(d, opr_sz, simd_maxsz(desc));
2911 }
2912 
2913 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2914                             void *va, CPUARMState *env, uint32_t desc)
2915 {
2916     intptr_t i, j, opr_sz = simd_oprsz(desc);
2917     intptr_t index = simd_data(desc);
2918     intptr_t elements = opr_sz / 4;
2919     intptr_t eltspersegment = MIN(16 / 4, elements);
2920     float32 *d = vd, *a = va;
2921     uint32_t *n = vn, *m = vm;
2922     float_status fpst, fpst_odd;
2923 
2924     if (is_ebf(env, &fpst, &fpst_odd)) {
2925         for (i = 0; i < elements; i += eltspersegment) {
2926             uint32_t m_idx = m[i + H4(index)];
2927 
2928             for (j = i; j < i + eltspersegment; j++) {
2929                 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
2930             }
2931         }
2932     } else {
2933         for (i = 0; i < elements; i += eltspersegment) {
2934             uint32_t m_idx = m[i + H4(index)];
2935 
2936             for (j = i; j < i + eltspersegment; j++) {
2937                 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
2938             }
2939         }
2940     }
2941     clear_tail(d, opr_sz, simd_maxsz(desc));
2942 }
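
/*
 * In the indexed form above, the vector is processed in 128-bit
 * segments of four float32 results; every lane within a segment
 * reuses the single 32-bit element pair of M selected by "index"
 * inside that segment.  For example, with index == 1, lanes 0..3 all
 * use element 1 of M and lanes 4..7 all use element 5.
 */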
2943 
2944 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
2945                          CPUARMState *env, uint32_t desc)
2946 {
2947     intptr_t s, opr_sz = simd_oprsz(desc);
2948     float32 *d = vd, *a = va;
2949     uint32_t *n = vn, *m = vm;
2950     float_status fpst, fpst_odd;
2951 
2952     if (is_ebf(env, &fpst, &fpst_odd)) {
2953         for (s = 0; s < opr_sz / 4; s += 4) {
2954             float32 sum00, sum01, sum10, sum11;
2955 
2956             /*
2957              * Process the entire segment at once, writing back the
2958              * results only after we've consumed all of the inputs.
2959              *
2960              * Key to indices by column:
2961              *               i   j               i   k             j   k
2962              */
2963             sum00 = a[s + H4(0 + 0)];
2964             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2965             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2966 
2967             sum01 = a[s + H4(0 + 1)];
2968             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2969             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2970 
2971             sum10 = a[s + H4(2 + 0)];
2972             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2973             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2974 
2975             sum11 = a[s + H4(2 + 1)];
2976             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2977             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2978 
2979             d[s + H4(0 + 0)] = sum00;
2980             d[s + H4(0 + 1)] = sum01;
2981             d[s + H4(2 + 0)] = sum10;
2982             d[s + H4(2 + 1)] = sum11;
2983         }
2984     } else {
2985         for (s = 0; s < opr_sz / 4; s += 4) {
2986             float32 sum00, sum01, sum10, sum11;
2987 
2988             /*
2989              * Process the entire segment at once, writing back the
2990              * results only after we've consumed all of the inputs.
2991              *
2992              * Key to indices by column:
2993              *               i   j           i   k             j   k
2994              */
2995             sum00 = a[s + H4(0 + 0)];
2996             sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
2997             sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
2998 
2999             sum01 = a[s + H4(0 + 1)];
3000             sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
3001             sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
3002 
3003             sum10 = a[s + H4(2 + 0)];
3004             sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
3005             sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
3006 
3007             sum11 = a[s + H4(2 + 1)];
3008             sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
3009             sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
3010 
3011             d[s + H4(0 + 0)] = sum00;
3012             d[s + H4(0 + 1)] = sum01;
3013             d[s + H4(2 + 0)] = sum10;
3014             d[s + H4(2 + 1)] = sum11;
3015         }
3016     }
3017     clear_tail(d, opr_sz, simd_maxsz(desc));
3018 }
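
/*
 * As with do_mmla_b above, each 16-byte segment of BFMMLA forms a 2x2
 * block: row i of N is the element pairs n[s+2i], n[s+2i+1] (four
 * bfloat16 values), and likewise for M, so that
 *
 *     sum[i][j] = a[i][j] accumulated with the dot product of
 *                 N row i and M row j  (two two-way FPDot steps)
 *
 * using either the EBF=0 (bfdotadd) or EBF=1 (bfdotadd_ebf) flavour
 * selected by is_ebf().
 */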
3019 
3020 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
3021                          float_status *stat, uint32_t desc)
3022 {
3023     intptr_t i, opr_sz = simd_oprsz(desc);
3024     intptr_t sel = simd_data(desc);
3025     float32 *d = vd, *a = va;
3026     bfloat16 *n = vn, *m = vm;
3027 
3028     for (i = 0; i < opr_sz / 4; ++i) {
3029         float32 nn = n[H2(i * 2 + sel)] << 16;
3030         float32 mm = m[H2(i * 2 + sel)] << 16;
3031         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
3032     }
3033     clear_tail(d, opr_sz, simd_maxsz(desc));
3034 }
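
/*
 * For the widening multiply-add above, "sel" (from simd_data) picks
 * either the even ("bottom") or odd ("top") bfloat16 element of each
 * 32-bit pair, widens both operands to float32 via the usual << 16,
 * and accumulates with a single fused multiply-add per result
 * element.
 */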
3035 
3036 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
3037                              void *va, float_status *stat, uint32_t desc)
3038 {
3039     intptr_t i, j, opr_sz = simd_oprsz(desc);
3040     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3041     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3042     intptr_t elements = opr_sz / 4;
3043     intptr_t eltspersegment = MIN(16 / 4, elements);
3044     float32 *d = vd, *a = va;
3045     bfloat16 *n = vn, *m = vm;
3046 
3047     for (i = 0; i < elements; i += eltspersegment) {
3048         float32 m_idx = m[H2(2 * i + index)] << 16;
3049 
3050         for (j = i; j < i + eltspersegment; j++) {
3051             float32 n_j = n[H2(2 * j + sel)] << 16;
3052             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
3053         }
3054     }
3055     clear_tail(d, opr_sz, simd_maxsz(desc));
3056 }
3057 
3058 #define DO_CLAMP(NAME, TYPE) \
3059 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
3060 {                                                                       \
3061     intptr_t i, opr_sz = simd_oprsz(desc);                              \
3062     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
3063         TYPE aa = *(TYPE *)(a + i);                                     \
3064         TYPE nn = *(TYPE *)(n + i);                                     \
3065         TYPE mm = *(TYPE *)(m + i);                                     \
3066         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
3067         *(TYPE *)(d + i) = dd;                                          \
3068     }                                                                   \
3069     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
3070 }
3071 
3072 DO_CLAMP(gvec_sclamp_b, int8_t)
3073 DO_CLAMP(gvec_sclamp_h, int16_t)
3074 DO_CLAMP(gvec_sclamp_s, int32_t)
3075 DO_CLAMP(gvec_sclamp_d, int64_t)
3076 
3077 DO_CLAMP(gvec_uclamp_b, uint8_t)
3078 DO_CLAMP(gvec_uclamp_h, uint16_t)
3079 DO_CLAMP(gvec_uclamp_s, uint32_t)
3080 DO_CLAMP(gvec_uclamp_d, uint64_t)
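
/*
 * The clamp helpers compute MIN(MAX(a, n), m), i.e. the value from
 * the "a" operand clamped to the inclusive range [n, m] (for n <= m).
 * A small worked example with gvec_sclamp_b (values chosen only for
 * illustration):
 *
 *     a = -100, n = -16, m = 15   ->  d = -16
 *     a =   42, n = -16, m = 15   ->  d =  15
 */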
3081 
3082 /* Bit count in each 8-bit word. */
3083 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
3084 {
3085     intptr_t i, opr_sz = simd_oprsz(desc);
3086     uint8_t *d = vd, *n = vn;
3087 
3088     for (i = 0; i < opr_sz; ++i) {
3089         d[i] = ctpop8(n[i]);
3090     }
3091     clear_tail(d, opr_sz, simd_maxsz(desc));
3092 }
3093 
3094 /* Reverse bits in each 8-bit word. */
3095 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
3096 {
3097     intptr_t i, opr_sz = simd_oprsz(desc);
3098     uint64_t *d = vd, *n = vn;
3099 
3100     for (i = 0; i < opr_sz / 8; ++i) {
3101         d[i] = revbit64(bswap64(n[i]));
3102     }
3103     clear_tail(d, opr_sz, simd_maxsz(desc));
3104 }
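
/*
 * The revbit64(bswap64(x)) pairing above works because reversing all
 * 64 bits reverses both the bit order inside each byte and the byte
 * order; the preceding bswap64 pre-reverses the byte order, so the
 * net effect is a bit reversal within each byte, with the bytes left
 * in place.  For example, a byte 0x01 anywhere in the word becomes
 * 0x80 in the same position.
 */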
3105 
3106 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
3107 {
3108     intptr_t i, opr_sz = simd_oprsz(desc);
3109     uint32_t *d = vd, *n = vn;
3110 
3111     for (i = 0; i < opr_sz / 4; ++i) {
3112         d[i] = helper_recpe_u32(n[i]);
3113     }
3114     clear_tail(d, opr_sz, simd_maxsz(desc));
3115 }
3116 
3117 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
3118 {
3119     intptr_t i, opr_sz = simd_oprsz(desc);
3120     uint32_t *d = vd, *n = vn;
3121 
3122     for (i = 0; i < opr_sz / 4; ++i) {
3123         d[i] = helper_rsqrte_u32(n[i]);
3124     }
3125     clear_tail(d, opr_sz, simd_maxsz(desc));
3126 }
3127