/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

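/*
 * For example, a predicate byte of 0x05 (bits 0 and 2 set) expands to
 * expand_pred_b_data[0x05] == 0x0000000000ff00ff.
 */
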
/*
 * Similarly for half-word elements. The predicate has one bit per
 * byte, so for 16-bit elements only the even-numbered bits can be
 * set; indices with any odd bit set (i & 0xaa) cannot occur and are
 * skipped by the generator below.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     * i.e. halve every term and shift one bit less.
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}

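/*
 * Worked example: do_sqrdmlah_b(-128, -128, 0, false, true) computes
 * ret = 16384, adds the rounding term 1 << 6, and shifts right by 7,
 * giving 128; that does not fit in int8_t, so the result saturates
 * to INT8_MAX.
 */
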
void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

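/*
 * The *sat argument above is only ever set, never cleared, matching
 * the cumulative FPSCR.QC semantics; the Neon helpers point it at
 * env->vfp.qc, while the SVE2 helpers pass a discarded local.
 */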
uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

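/*
 * For the "_idx" helpers below, the vector is processed in 128-bit
 * segments: within each segment, every element of n is multiplied by
 * the one element of m selected by the index.
 */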
void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

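/*
 * The SVE2 indexed forms need not clamp to the vector length: SVE
 * vectors are a multiple of 16 bytes, so every 128-bit segment is
 * complete (compare the MIN() in the Neon helpers above).
 */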
void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
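/* Saturate a 128-bit value to int64_t: it fits iff the high half is
 * the sign extension of the low half. */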
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

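/*
 * Note the rounding addend is 1 << 62: with every term halved as in
 * do_sqrdmlah_b, (n * m + (a << 63) + round * (1 << 62)) >> 63 equals
 * ((2 * n * m) + (a << 64) + round * (1 << 63)) >> 64.
 */
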
void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8- and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)

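/*
 * For example, DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) expands to
 * a helper computing, for each 32-bit lane i,
 *     d[i] = a[i] + n[4 * i + 0] * m[4 * i + 0]
 *                 + ... + n[4 * i + 3] * m[4 * i + 3],
 * with the 8-bit products sign-extended to 32 bits.
 */
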
#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    /*                                                                    \
     * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
     * first iteration might not be a full 16 byte segment. But           \
     * for vector lengths beyond that this must be SVE and we know        \
     * opr_sz is a multiple of 16, so we need not clamp segend            \
     * to opr_sz_n when we advance it at the end of the loop.             \
     */                                                                   \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        segend = i + (16 / sizeof(TYPED));                                \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)

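/*
 * Floating-point complex add (FCADD). Even-indexed elements hold the
 * real parts and odd-indexed elements the imaginary parts; the 1-bit
 * desc field selects the rotation (90 or 270 degrees) by choosing
 * which of the two additions negates its m operand.
 */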
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

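/*
 * Floating-point complex multiply-add (FCMLA). Two desc bits encode
 * the rotation: "flip" selects which element of each n pair feeds
 * both products, and the neg_real/neg_imag signs complete the
 * rotation by 0, 90, 180 or 270 degrees.
 */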
void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

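/*
 * Scalar conversion wrappers giving the Arm behaviour for NaN inputs:
 * raise Invalid and return zero, where the underlying softfloat
 * routines would instead return a saturated result.
 */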
static int16_t vfp_tosszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float64)                \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)   \
    DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)

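/*
 * The REV forms reuse the forward primitives with the operands
 * swapped: e.g. DO_2OP_CMP0(clt, cgt, REV) defines float16_clt0(op)
 * as float16_cgt(float16_zero, op).
 */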
DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/*
 * Reciprocal step. These are the AArch32 versions, which use a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
                  float_status *stat, uint32_t desc)                       \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)

DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)

#endif
#undef DO_3OP

1462 /* Non-fused multiply-add (unlike float16_muladd etc., which are fused) */
1463 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1464                                  float_status *stat)
1465 {
1466     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1467 }
1468 
1469 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1470                                  float_status *stat)
1471 {
1472     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1473 }
1474 
1475 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1476                                  float_status *stat)
1477 {
1478     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1479 }
1480 
1481 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1482                                  float_status *stat)
1483 {
1484     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1485 }
1486 
1487 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1488 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1489                                 float_status *stat)
1490 {
1491     return float16_muladd(op1, op2, dest, 0, stat);
1492 }
1493 
1494 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1495                                  float_status *stat)
1496 {
1497     return float32_muladd(op1, op2, dest, 0, stat);
1498 }
1499 
1500 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1501                                  float_status *stat)
1502 {
1503     return float64_muladd(op1, op2, dest, 0, stat);
1504 }
1505 
1506 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1507                                  float_status *stat)
1508 {
1509     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1510 }
1511 
1512 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1513                                  float_status *stat)
1514 {
1515     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1516 }
1517 
1518 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1519                                  float_status *stat)
1520 {
1521     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1522 }
1523 
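/*
 * Illustrative difference between the non-fused and fused variants
 * above (a sketch, not used by the helpers themselves): with
 * r = float32_mul(x, y, s), round-to-nearest and no underflow,
 *
 *     float32_mulsub_nf(r, x, y, s)  ==  r - round(x * y)  ==  0
 *     float32_mulsub_f(r, x, y, s)   ==  round(r - x * y)
 *
 * where the fused result is the exact rounding error of the product
 * and is generally nonzero.
 */
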
1524 #define DO_MULADD(NAME, FUNC, TYPE)                                        \
1525 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1526                   float_status *stat, uint32_t desc)                       \
1527 {                                                                          \
1528     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1529     TYPE *d = vd, *n = vn, *m = vm;                                        \
1530     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1531         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1532     }                                                                      \
1533     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1534 }
1535 
1536 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1537 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1538 
1539 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1540 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1541 
1542 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1543 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1544 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1545 
1546 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1547 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1548 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1549 
1550 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1551  * For AdvSIMD, there is of course only one such vector segment.
1552  */
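/*
 * For example (illustrative values): with 32-bit elements, a 256-bit
 * SVE vector and idx == 1, each 128-bit segment holds 4 elements, so
 * in DO_MUL_IDX below n[0..3] are multiplied by m[1] and n[4..7]
 * by m[5].
 */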
1553 
1554 #define DO_MUL_IDX(NAME, TYPE, H) \
1555 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1556 {                                                                          \
1557     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1558     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1559     intptr_t idx = simd_data(desc);                                        \
1560     TYPE *d = vd, *n = vn, *m = vm;                                        \
1561     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1562         TYPE mm = m[H(i + idx)];                                           \
1563         for (j = 0; j < segment; j++) {                                    \
1564             d[i + j] = n[i + j] * mm;                                      \
1565         }                                                                  \
1566     }                                                                      \
1567     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1568 }
1569 
1570 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1571 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1572 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1573 
1574 #undef DO_MUL_IDX
1575 
1576 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1577 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1578 {                                                                          \
1579     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1580     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1581     intptr_t idx = simd_data(desc);                                        \
1582     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1583     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1584         TYPE mm = m[H(i + idx)];                                           \
1585         for (j = 0; j < segment; j++) {                                    \
1586             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1587         }                                                                  \
1588     }                                                                      \
1589     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1590 }
1591 
1592 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1593 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1594 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1595 
1596 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1597 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1598 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1599 
1600 #undef DO_MLA_IDX
1601 
1602 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1603 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1604                   float_status *stat, uint32_t desc)                       \
1605 {                                                                          \
1606     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1607     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1608     intptr_t idx = simd_data(desc);                                        \
1609     TYPE *d = vd, *n = vn, *m = vm;                                        \
1610     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1611         TYPE mm = m[H(i + idx)];                                           \
1612         for (j = 0; j < segment; j++) {                                    \
1613             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1614         }                                                                  \
1615     }                                                                      \
1616     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1617 }
1618 
1619 #define nop(N, M, S) (M)
1620 
1621 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1622 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1623 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1624 
1625 #ifdef TARGET_AARCH64
1626 
1627 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1628 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1629 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1630 
1631 #endif
1632 
1633 #undef nop
1634 
1635 /*
1636  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1637  * the fused ops below they accumulate both from and into Vd.
1638  */
1639 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1640 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1641 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1642 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1643 
1644 #undef DO_FMUL_IDX
1645 
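/*
 * In DO_FMLA_IDX below, op1_neg is the FMLS bit from the descriptor
 * shifted up to the sign bit of TYPE (e.g. 0x8000 for float16), so
 * that n[i] ^ op1_neg negates op1 by flipping its IEEE sign bit.
 */
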
1646 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1647 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1648                   float_status *stat, uint32_t desc)                       \
1649 {                                                                          \
1650     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1651     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1652     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1653     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1654     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1655     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1656     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1657         TYPE mm = m[H(i + idx)];                                           \
1658         for (j = 0; j < segment; j++) {                                    \
1659             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1660                                      mm, a[i + j], 0, stat);               \
1661         }                                                                  \
1662     }                                                                      \
1663     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1664 }
1665 
1666 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1667 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1668 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1669 
1670 #undef DO_FMLA_IDX
1671 
1672 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1673 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1674 {                                                                          \
1675     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1676     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1677     bool q = false;                                                        \
1678     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1679         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1680         if (dd < MIN) {                                                    \
1681             dd = MIN;                                                      \
1682             q = true;                                                      \
1683         } else if (dd > MAX) {                                             \
1684             dd = MAX;                                                      \
1685             q = true;                                                      \
1686         }                                                                  \
1687         d[i] = dd;                                                         \
1688     }                                                                      \
1689     if (q) {                                                               \
1690         uint32_t *qc = vq;                                                 \
1691         qc[0] = 1;                                                         \
1692     }                                                                      \
1693     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1694 }
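
/*
 * A worked example (illustrative values): for gvec_uqadd_b with
 * n[i] == 250 and m[i] == 10, dd == 260 in the wider int, which
 * exceeds UINT8_MAX, so d[i] saturates to 255 and the sticky QC
 * flag is set.
 */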
1695 
1696 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1697 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1698 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1699 
1700 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1701 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1702 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1703 
1704 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1705 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1706 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1707 
1708 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1709 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1710 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1711 
1712 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1713 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1714 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1715 
1716 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1717 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1718 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1719 
1720 #undef DO_SAT
1721 
1722 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1723                           void *vm, uint32_t desc)
1724 {
1725     intptr_t i, oprsz = simd_oprsz(desc);
1726     uint64_t *d = vd, *n = vn, *m = vm;
1727     bool q = false;
1728 
1729     for (i = 0; i < oprsz / 8; i++) {
1730         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1731         if (dd < nn) {
1732             dd = UINT64_MAX;
1733             q = true;
1734         }
1735         d[i] = dd;
1736     }
1737     if (q) {
1738         uint32_t *qc = vq;
1739         qc[0] = 1;
1740     }
1741     clear_tail(d, oprsz, simd_maxsz(desc));
1742 }
1743 
1744 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1745                           void *vm, uint32_t desc)
1746 {
1747     intptr_t i, oprsz = simd_oprsz(desc);
1748     uint64_t *d = vd, *n = vn, *m = vm;
1749     bool q = false;
1750 
1751     for (i = 0; i < oprsz / 8; i++) {
1752         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1753         if (nn < mm) {
1754             dd = 0;
1755             q = true;
1756         }
1757         d[i] = dd;
1758     }
1759     if (q) {
1760         uint32_t *qc = vq;
1761         qc[0] = 1;
1762     }
1763     clear_tail(d, oprsz, simd_maxsz(desc));
1764 }
1765 
1766 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1767                           void *vm, uint32_t desc)
1768 {
1769     intptr_t i, oprsz = simd_oprsz(desc);
1770     int64_t *d = vd, *n = vn, *m = vm;
1771     bool q = false;
1772 
1773     for (i = 0; i < oprsz / 8; i++) {
1774         int64_t nn = n[i], mm = m[i], dd = nn + mm;
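        /*
         * Signed overflow occurred iff nn and mm have the same sign
         * but dd's sign differs; (nn >> 63) ^ ~INT64_MIN then yields
         * INT64_MAX for non-negative nn and INT64_MIN for negative nn.
         */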
1775         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1776             dd = (nn >> 63) ^ ~INT64_MIN;
1777             q = true;
1778         }
1779         d[i] = dd;
1780     }
1781     if (q) {
1782         uint32_t *qc = vq;
1783         qc[0] = 1;
1784     }
1785     clear_tail(d, oprsz, simd_maxsz(desc));
1786 }
1787 
1788 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1789                           void *vm, uint32_t desc)
1790 {
1791     intptr_t i, oprsz = simd_oprsz(desc);
1792     int64_t *d = vd, *n = vn, *m = vm;
1793     bool q = false;
1794 
1795     for (i = 0; i < oprsz / 8; i++) {
1796         int64_t nn = n[i], mm = m[i], dd = nn - mm;
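        /*
         * For subtraction the test is inverted: overflow requires nn
         * and mm to have differing signs, hence (nn ^ mm) rather than
         * ~(nn ^ mm) in the mask.
         */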
1797         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1798             dd = (nn >> 63) ^ ~INT64_MIN;
1799             q = true;
1800         }
1801         d[i] = dd;
1802     }
1803     if (q) {
1804         uint32_t *qc = vq;
1805         qc[0] = 1;
1806     }
1807     clear_tail(d, oprsz, simd_maxsz(desc));
1808 }
1809 
1810 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1811                            void *vm, uint32_t desc)
1812 {
1813     intptr_t i, oprsz = simd_oprsz(desc);
1814     uint64_t *d = vd, *n = vn, *m = vm;
1815     bool q = false;
1816 
1817     for (i = 0; i < oprsz / 8; i++) {
1818         uint64_t nn = n[i];
1819         int64_t mm = m[i];
1820         uint64_t dd = nn + mm;
1821 
1822         if (mm < 0) {
1823             if (nn < (uint64_t)-mm) {
1824                 dd = 0;
1825                 q = true;
1826             }
1827         } else {
1828             if (dd < nn) {
1829                 dd = UINT64_MAX;
1830                 q = true;
1831             }
1832         }
1833         d[i] = dd;
1834     }
1835     if (q) {
1836         uint32_t *qc = vq;
1837         qc[0] = 1;
1838     }
1839     clear_tail(d, oprsz, simd_maxsz(desc));
1840 }
1841 
1842 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1843                            void *vm, uint32_t desc)
1844 {
1845     intptr_t i, oprsz = simd_oprsz(desc);
1846     uint64_t *d = vd, *n = vn, *m = vm;
1847     bool q = false;
1848 
1849     for (i = 0; i < oprsz / 8; i++) {
1850         int64_t nn = n[i];
1851         uint64_t mm = m[i];
1852         int64_t dd = nn + mm;
1853 
1854         if (mm > (uint64_t)(INT64_MAX - nn)) {
1855             dd = INT64_MAX;
1856             q = true;
1857         }
1858         d[i] = dd;
1859     }
1860     if (q) {
1861         uint32_t *qc = vq;
1862         qc[0] = 1;
1863     }
1864     clear_tail(d, oprsz, simd_maxsz(desc));
1865 }
1866 
1867 #define DO_SRA(NAME, TYPE)                              \
1868 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1869 {                                                       \
1870     intptr_t i, oprsz = simd_oprsz(desc);               \
1871     int shift = simd_data(desc);                        \
1872     TYPE *d = vd, *n = vn;                              \
1873     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1874         d[i] += n[i] >> shift;                          \
1875     }                                                   \
1876     clear_tail(d, oprsz, simd_maxsz(desc));             \
1877 }
1878 
1879 DO_SRA(gvec_ssra_b, int8_t)
1880 DO_SRA(gvec_ssra_h, int16_t)
1881 DO_SRA(gvec_ssra_s, int32_t)
1882 DO_SRA(gvec_ssra_d, int64_t)
1883 
1884 DO_SRA(gvec_usra_b, uint8_t)
1885 DO_SRA(gvec_usra_h, uint16_t)
1886 DO_SRA(gvec_usra_s, uint32_t)
1887 DO_SRA(gvec_usra_d, uint64_t)
1888 
1889 #undef DO_SRA
1890 
1891 #define DO_RSHR(NAME, TYPE)                             \
1892 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1893 {                                                       \
1894     intptr_t i, oprsz = simd_oprsz(desc);               \
1895     int shift = simd_data(desc);                        \
1896     TYPE *d = vd, *n = vn;                              \
1897     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1898         TYPE tmp = n[i] >> (shift - 1);                 \
1899         d[i] = (tmp >> 1) + (tmp & 1);                  \
1900     }                                                   \
1901     clear_tail(d, oprsz, simd_maxsz(desc));             \
1902 }
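
/*
 * A worked example (illustrative values): gvec_urshr_b with
 * shift == 2 and n[i] == 7 gives tmp == 3 and d[i] == 2, i.e.
 * 7/4 rounded to nearest, with ties rounding up.
 */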
1903 
1904 DO_RSHR(gvec_srshr_b, int8_t)
1905 DO_RSHR(gvec_srshr_h, int16_t)
1906 DO_RSHR(gvec_srshr_s, int32_t)
1907 DO_RSHR(gvec_srshr_d, int64_t)
1908 
1909 DO_RSHR(gvec_urshr_b, uint8_t)
1910 DO_RSHR(gvec_urshr_h, uint16_t)
1911 DO_RSHR(gvec_urshr_s, uint32_t)
1912 DO_RSHR(gvec_urshr_d, uint64_t)
1913 
1914 #undef DO_RSHR
1915 
1916 #define DO_RSRA(NAME, TYPE)                             \
1917 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1918 {                                                       \
1919     intptr_t i, oprsz = simd_oprsz(desc);               \
1920     int shift = simd_data(desc);                        \
1921     TYPE *d = vd, *n = vn;                              \
1922     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1923         TYPE tmp = n[i] >> (shift - 1);                 \
1924         d[i] += (tmp >> 1) + (tmp & 1);                 \
1925     }                                                   \
1926     clear_tail(d, oprsz, simd_maxsz(desc));             \
1927 }
1928 
1929 DO_RSRA(gvec_srsra_b, int8_t)
1930 DO_RSRA(gvec_srsra_h, int16_t)
1931 DO_RSRA(gvec_srsra_s, int32_t)
1932 DO_RSRA(gvec_srsra_d, int64_t)
1933 
1934 DO_RSRA(gvec_ursra_b, uint8_t)
1935 DO_RSRA(gvec_ursra_h, uint16_t)
1936 DO_RSRA(gvec_ursra_s, uint32_t)
1937 DO_RSRA(gvec_ursra_d, uint64_t)
1938 
1939 #undef DO_RSRA
1940 
1941 #define DO_SRI(NAME, TYPE)                              \
1942 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1943 {                                                       \
1944     intptr_t i, oprsz = simd_oprsz(desc);               \
1945     int shift = simd_data(desc);                        \
1946     TYPE *d = vd, *n = vn;                              \
1947     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1948         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1949     }                                                   \
1950     clear_tail(d, oprsz, simd_maxsz(desc));             \
1951 }
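
/*
 * E.g. (illustrative) gvec_sri_b with shift == 4 computes
 * d[i] = (d[i] & 0xf0) | (n[i] >> 4): the shifted-in field replaces
 * the low bits while the top 'shift' bits of d[i] are preserved.
 */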
1952 
1953 DO_SRI(gvec_sri_b, uint8_t)
1954 DO_SRI(gvec_sri_h, uint16_t)
1955 DO_SRI(gvec_sri_s, uint32_t)
1956 DO_SRI(gvec_sri_d, uint64_t)
1957 
1958 #undef DO_SRI
1959 
1960 #define DO_SLI(NAME, TYPE)                              \
1961 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1962 {                                                       \
1963     intptr_t i, oprsz = simd_oprsz(desc);               \
1964     int shift = simd_data(desc);                        \
1965     TYPE *d = vd, *n = vn;                              \
1966     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1967         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1968     }                                                   \
1969     clear_tail(d, oprsz, simd_maxsz(desc));             \
1970 }
1971 
1972 DO_SLI(gvec_sli_b, uint8_t)
1973 DO_SLI(gvec_sli_h, uint16_t)
1974 DO_SLI(gvec_sli_s, uint32_t)
1975 DO_SLI(gvec_sli_d, uint64_t)
1976 
1977 #undef DO_SLI
1978 
1979 /*
1980  * Convert float16 to float32, raising no exceptions and
1981  * preserving exceptional values, including SNaN.
1982  * This is effectively an unpack+repack operation.
1983  */
1984 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1985 {
1986     const int f16_bias = 15;
1987     const int f32_bias = 127;
1988     uint32_t sign = extract32(f16, 15, 1);
1989     uint32_t exp = extract32(f16, 10, 5);
1990     uint32_t frac = extract32(f16, 0, 10);
1991 
1992     if (exp == 0x1f) {
1993         /* Inf or NaN */
1994         exp = 0xff;
1995     } else if (exp == 0) {
1996         /* Zero or denormal.  */
1997         if (frac != 0) {
1998             if (fz16) {
1999                 frac = 0;
2000             } else {
2001                 /*
2002                  * Denormal; these are all normal float32.
2003                  * Shift the fraction so that the msb is at bit 11,
2004                  * then remove bit 11 as the implicit bit of the
2005                  * normalized float32.  Note that we still go through
2006                  * the shift for normal numbers below, to put the
2007                  * float32 fraction at the right place.
2008                  */
2009                 int shift = clz32(frac) - 21;
2010                 frac = (frac << shift) & 0x3ff;
2011                 exp = f32_bias - f16_bias - shift + 1;
2012             }
2013         }
2014     } else {
2015         /* Normal number; adjust the bias.  */
2016         exp += f32_bias - f16_bias;
2017     }
2018     sign <<= 31;
2019     exp <<= 23;
2020     frac <<= 23 - 10;
2021 
2022     return sign | exp | frac;
2023 }
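
/*
 * A worked example of the denormal path (illustrative): the smallest
 * f16 subnormal 0x0001 has clz32(frac) == 31, so shift == 10, frac
 * becomes 0 and exp == 127 - 15 - 10 + 1 == 103, which is exactly
 * the float32 encoding of 2^-24.
 */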
2024 
2025 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2026 {
2027     /*
2028      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2029      * Load the 2nd qword iff is_q & is_2.
2030      * Shift to the 2nd dword iff !is_q & is_2.
2031      * For !is_q & !is_2, the upper bits of the result are garbage.
2032      */
2033     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2034 }
2035 
2036 /*
2037  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2038  * as there are not yet SVE versions that might use blocking.
2039  */
2040 
2041 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2042                      uint32_t desc, bool fz16)
2043 {
2044     intptr_t i, oprsz = simd_oprsz(desc);
2045     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2046     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2047     int is_q = oprsz == 16;
2048     uint64_t n_4, m_4;
2049 
2050     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2051     n_4 = load4_f16(vn, is_q, is_2);
2052     m_4 = load4_f16(vm, is_q, is_2);
2053 
2054     /* Negate all inputs for FMLSL at once.  */
2055     if (is_s) {
2056         n_4 ^= 0x8000800080008000ull;
2057     }
2058 
2059     for (i = 0; i < oprsz / 4; i++) {
2060         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2061         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2062         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2063     }
2064     clear_tail(d, oprsz, simd_maxsz(desc));
2065 }
2066 
2067 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2068                             CPUARMState *env, uint32_t desc)
2069 {
2070     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2071              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2072 }
2073 
2074 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2075                             CPUARMState *env, uint32_t desc)
2076 {
2077     do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, desc,
2078              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2079 }
2080 
2081 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2082                                CPUARMState *env, uint32_t desc)
2083 {
2084     intptr_t i, oprsz = simd_oprsz(desc);
2085     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2086     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2087     float_status *status = &env->vfp.fp_status_a64;
2088     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2089 
2090     for (i = 0; i < oprsz; i += sizeof(float32)) {
2091         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2092         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2093         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2094         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2095         float32 aa = *(float32 *)(va + H1_4(i));
2096 
2097         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2098     }
2099 }
2100 
2101 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2102                          uint32_t desc, bool fz16)
2103 {
2104     intptr_t i, oprsz = simd_oprsz(desc);
2105     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2106     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2107     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2108     int is_q = oprsz == 16;
2109     uint64_t n_4;
2110     float32 m_1;
2111 
2112     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2113     n_4 = load4_f16(vn, is_q, is_2);
2114 
2115     /* Negate all inputs for FMLSL at once.  */
2116     if (is_s) {
2117         n_4 ^= 0x8000800080008000ull;
2118     }
2119 
2120     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2121 
2122     for (i = 0; i < oprsz / 4; i++) {
2123         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2124         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2125     }
2126     clear_tail(d, oprsz, simd_maxsz(desc));
2127 }
2128 
2129 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2130                                 CPUARMState *env, uint32_t desc)
2131 {
2132     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2133                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2134 }
2135 
2136 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2137                                 CPUARMState *env, uint32_t desc)
2138 {
2139     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, desc,
2140                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2141 }
2142 
2143 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2144                                CPUARMState *env, uint32_t desc)
2145 {
2146     intptr_t i, j, oprsz = simd_oprsz(desc);
2147     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2148     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2149     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2150     float_status *status = &env->vfp.fp_status_a64;
2151     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2152 
2153     for (i = 0; i < oprsz; i += 16) {
2154         float16 mm_16 = *(float16 *)(vm + i + idx);
2155         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2156 
2157         for (j = 0; j < 16; j += sizeof(float32)) {
2158             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2159             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2160             float32 aa = *(float32 *)(va + H1_4(i + j));
2161 
2162             *(float32 *)(vd + H1_4(i + j)) =
2163                 float32_muladd(nn, mm, aa, 0, status);
2164         }
2165     }
2166 }
2167 
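/*
 * SSHL/USHL take a per-element shift count from the low byte of m:
 * non-negative counts shift left, negative counts shift right
 * (arithmetic for SSHL, logical for USHL).  Out-of-range counts give
 * 0 for left shifts and for USHL right shifts, and a full sign-fill
 * for SSHL right shifts.
 */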
2168 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2169 {
2170     intptr_t i, opr_sz = simd_oprsz(desc);
2171     int8_t *d = vd, *n = vn, *m = vm;
2172 
2173     for (i = 0; i < opr_sz; ++i) {
2174         int8_t mm = m[i];
2175         int8_t nn = n[i];
2176         int8_t res = 0;
2177         if (mm >= 0) {
2178             if (mm < 8) {
2179                 res = nn << mm;
2180             }
2181         } else {
2182             res = nn >> (mm > -8 ? -mm : 7);
2183         }
2184         d[i] = res;
2185     }
2186     clear_tail(d, opr_sz, simd_maxsz(desc));
2187 }
2188 
2189 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2190 {
2191     intptr_t i, opr_sz = simd_oprsz(desc);
2192     int16_t *d = vd, *n = vn, *m = vm;
2193 
2194     for (i = 0; i < opr_sz / 2; ++i) {
2195         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2196         int16_t nn = n[i];
2197         int16_t res = 0;
2198         if (mm >= 0) {
2199             if (mm < 16) {
2200                 res = nn << mm;
2201             }
2202         } else {
2203             res = nn >> (mm > -16 ? -mm : 15);
2204         }
2205         d[i] = res;
2206     }
2207     clear_tail(d, opr_sz, simd_maxsz(desc));
2208 }
2209 
2210 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2211 {
2212     intptr_t i, opr_sz = simd_oprsz(desc);
2213     uint8_t *d = vd, *n = vn, *m = vm;
2214 
2215     for (i = 0; i < opr_sz; ++i) {
2216         int8_t mm = m[i];
2217         uint8_t nn = n[i];
2218         uint8_t res = 0;
2219         if (mm >= 0) {
2220             if (mm < 8) {
2221                 res = nn << mm;
2222             }
2223         } else {
2224             if (mm > -8) {
2225                 res = nn >> -mm;
2226             }
2227         }
2228         d[i] = res;
2229     }
2230     clear_tail(d, opr_sz, simd_maxsz(desc));
2231 }
2232 
2233 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2234 {
2235     intptr_t i, opr_sz = simd_oprsz(desc);
2236     uint16_t *d = vd, *n = vn, *m = vm;
2237 
2238     for (i = 0; i < opr_sz / 2; ++i) {
2239         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2240         uint16_t nn = n[i];
2241         uint16_t res = 0;
2242         if (mm >= 0) {
2243             if (mm < 16) {
2244                 res = nn << mm;
2245             }
2246         } else {
2247             if (mm > -16) {
2248                 res = nn >> -mm;
2249             }
2250         }
2251         d[i] = res;
2252     }
2253     clear_tail(d, opr_sz, simd_maxsz(desc));
2254 }
2255 
2256 /*
2257  * 8x8->8 polynomial multiply.
2258  *
2259  * Polynomial multiplication is like integer multiplication except the
2260  * partial products are XORed, not added.
2261  *
2262  * TODO: expose this as a generic vector operation, as it is a common
2263  * crypto building block.
2264  */
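/*
 * For example (illustrative): 0x03 * 0x03 carry-less is
 * (x + 1)(x + 1) = x^2 + 1 = 0x05, not 9; the two middle partial
 * products cancel under XOR.
 */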
2265 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2266 {
2267     intptr_t i, opr_sz = simd_oprsz(desc);
2268     uint64_t *d = vd, *n = vn, *m = vm;
2269 
2270     for (i = 0; i < opr_sz / 8; ++i) {
2271         d[i] = clmul_8x8_low(n[i], m[i]);
2272     }
2273     clear_tail(d, opr_sz, simd_maxsz(desc));
2274 }
2275 
2276 /*
2277  * 64x64->128 polynomial multiply.
2278  * Because the lanes are not accessed in strict columns,
2279  * this probably cannot be turned into a generic helper.
2280  */
2281 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2282 {
2283     intptr_t i, opr_sz = simd_oprsz(desc);
2284     intptr_t hi = simd_data(desc);
2285     uint64_t *d = vd, *n = vn, *m = vm;
2286 
2287     for (i = 0; i < opr_sz / 8; i += 2) {
2288         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2289         d[i] = int128_getlo(r);
2290         d[i + 1] = int128_gethi(r);
2291     }
2292     clear_tail(d, opr_sz, simd_maxsz(desc));
2293 }
2294 
2295 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2296 {
2297     int hi = simd_data(desc);
2298     uint64_t *d = vd, *n = vn, *m = vm;
2299     uint64_t nn = n[hi], mm = m[hi];
2300 
2301     d[0] = clmul_8x4_packed(nn, mm);
2302     nn >>= 32;
2303     mm >>= 32;
2304     d[1] = clmul_8x4_packed(nn, mm);
2305 
2306     clear_tail(d, 16, simd_maxsz(desc));
2307 }
2308 
2309 #ifdef TARGET_AARCH64
2310 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2311 {
2312     int shift = simd_data(desc) * 8;
2313     intptr_t i, opr_sz = simd_oprsz(desc);
2314     uint64_t *d = vd, *n = vn, *m = vm;
2315 
2316     for (i = 0; i < opr_sz / 8; ++i) {
2317         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2318     }
2319 }
2320 
2321 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2322 {
2323     intptr_t sel = H4(simd_data(desc));
2324     intptr_t i, opr_sz = simd_oprsz(desc);
2325     uint32_t *n = vn, *m = vm;
2326     uint64_t *d = vd;
2327 
2328     for (i = 0; i < opr_sz / 8; ++i) {
2329         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2330     }
2331 }
2332 #endif
2333 
2334 #define DO_CMP0(NAME, TYPE, OP)                         \
2335 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2336 {                                                       \
2337     intptr_t i, opr_sz = simd_oprsz(desc);              \
2338     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2339         TYPE nn = *(TYPE *)(vn + i);                    \
2340         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2341     }                                                   \
2342     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2343 }
2344 
2345 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2346 DO_CMP0(gvec_clt0_b, int8_t, <)
2347 DO_CMP0(gvec_cle0_b, int8_t, <=)
2348 DO_CMP0(gvec_cgt0_b, int8_t, >)
2349 DO_CMP0(gvec_cge0_b, int8_t, >=)
2350 
2351 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2352 DO_CMP0(gvec_clt0_h, int16_t, <)
2353 DO_CMP0(gvec_cle0_h, int16_t, <=)
2354 DO_CMP0(gvec_cgt0_h, int16_t, >)
2355 DO_CMP0(gvec_cge0_h, int16_t, >=)
2356 
2357 #undef DO_CMP0
2358 
2359 #define DO_ABD(NAME, TYPE)                                      \
2360 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2361 {                                                               \
2362     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2363     TYPE *d = vd, *n = vn, *m = vm;                             \
2364                                                                 \
2365     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2366         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2367     }                                                           \
2368     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2369 }
2370 
2371 DO_ABD(gvec_sabd_b, int8_t)
2372 DO_ABD(gvec_sabd_h, int16_t)
2373 DO_ABD(gvec_sabd_s, int32_t)
2374 DO_ABD(gvec_sabd_d, int64_t)
2375 
2376 DO_ABD(gvec_uabd_b, uint8_t)
2377 DO_ABD(gvec_uabd_h, uint16_t)
2378 DO_ABD(gvec_uabd_s, uint32_t)
2379 DO_ABD(gvec_uabd_d, uint64_t)
2380 
2381 #undef DO_ABD
2382 
2383 #define DO_ABA(NAME, TYPE)                                      \
2384 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2385 {                                                               \
2386     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2387     TYPE *d = vd, *n = vn, *m = vm;                             \
2388                                                                 \
2389     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2390         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2391     }                                                           \
2392     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2393 }
2394 
2395 DO_ABA(gvec_saba_b, int8_t)
2396 DO_ABA(gvec_saba_h, int16_t)
2397 DO_ABA(gvec_saba_s, int32_t)
2398 DO_ABA(gvec_saba_d, int64_t)
2399 
2400 DO_ABA(gvec_uaba_b, uint8_t)
2401 DO_ABA(gvec_uaba_h, uint16_t)
2402 DO_ABA(gvec_uaba_s, uint32_t)
2403 DO_ABA(gvec_uaba_d, uint64_t)
2404 
2405 #undef DO_ABA
2406 
2407 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2408 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
2409                   float_status *stat, uint32_t desc)                       \
2410 {                                                                          \
2411     ARMVectorReg scratch;                                                  \
2412     intptr_t oprsz = simd_oprsz(desc);                                     \
2413     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2414     TYPE *d = vd, *n = vn, *m = vm;                                        \
2415     if (unlikely(d == m)) {                                                \
2416         m = memcpy(&scratch, m, oprsz);                                    \
2417     }                                                                      \
2418     for (intptr_t i = 0; i < half; ++i) {                                  \
2419         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2420     }                                                                      \
2421     for (intptr_t i = 0; i < half; ++i) {                                  \
2422         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2423     }                                                                      \
2424     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2425 }
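
/*
 * Pairwise layout (illustrative): the low half of d receives the
 * pairwise results of n and the high half those of m; m is copied
 * to a scratch register first when d aliases m, since d's low half
 * is written before m is fully read.
 */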
2426 
2427 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2428 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2429 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2430 
2431 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2432 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2433 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2434 
2435 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2436 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2437 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2438 
2439 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2440 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2441 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2442 
2443 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2444 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2445 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2446 
2447 #undef DO_3OP_PAIR
2448 
2449 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2450 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2451 {                                                               \
2452     ARMVectorReg scratch;                                       \
2453     intptr_t oprsz = simd_oprsz(desc);                          \
2454     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2455     TYPE *d = vd, *n = vn, *m = vm;                             \
2456     if (unlikely(d == m)) {                                     \
2457         m = memcpy(&scratch, m, oprsz);                         \
2458     }                                                           \
2459     for (intptr_t i = 0; i < half; ++i) {                       \
2460         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2461     }                                                           \
2462     for (intptr_t i = 0; i < half; ++i) {                       \
2463         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2464     }                                                           \
2465     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2466 }
2467 
2468 #define ADD(A, B) (A + B)
2469 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2470 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2471 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2472 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2473 #undef  ADD
2474 
2475 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2476 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2477 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2478 
2479 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2480 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2481 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2482 
2483 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2484 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2485 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2486 
2487 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2488 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2489 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2490 
2491 #undef DO_3OP_PAIR
2492 
2493 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2494     void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
2495     {                                                                   \
2496         intptr_t i, oprsz = simd_oprsz(desc);                           \
2497         int shift = simd_data(desc);                                    \
2498         TYPE *d = vd, *n = vn;                                          \
2499         float_status *fpst = stat;                                      \
2500         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2501             d[i] = FUNC(n[i], shift, fpst);                             \
2502         }                                                               \
2503         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2504     }
2505 
2506 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2507 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2508 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2509 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2510 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2511 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2512 
2513 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2514 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2515 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2516 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2517 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2518 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2519 
2520 #undef DO_VCVT_FIXED
2521 
2522 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2523     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2524     {                                                                   \
2525         intptr_t i, oprsz = simd_oprsz(desc);                           \
2526         uint32_t rmode = simd_data(desc);                               \
2527         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2528         TYPE *d = vd, *n = vn;                                          \
2529         set_float_rounding_mode(rmode, fpst);                           \
2530         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2531             d[i] = FUNC(n[i], 0, fpst);                                 \
2532         }                                                               \
2533         set_float_rounding_mode(prev_rmode, fpst);                      \
2534         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2535     }
2536 
2537 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
2538 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
2539 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2540 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2541 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2542 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2543 
2544 #undef DO_VCVT_RMODE
2545 
2546 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2547     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2548     {                                                                   \
2549         intptr_t i, oprsz = simd_oprsz(desc);                           \
2550         uint32_t rmode = simd_data(desc);                               \
2551         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2552         TYPE *d = vd, *n = vn;                                          \
2553         set_float_rounding_mode(rmode, fpst);                           \
2554         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2555             d[i] = FUNC(n[i], fpst);                                    \
2556         }                                                               \
2557         set_float_rounding_mode(prev_rmode, fpst);                      \
2558         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2559     }
2560 
2561 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2562 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2563 
2564 #undef DO_VRINT_RMODE
2565 
2566 #ifdef TARGET_AARCH64
2567 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
2568 {
2569     const uint8_t *indices = vm;
2570     size_t oprsz = simd_oprsz(desc);
2571     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2572     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2573     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2574     union {
2575         uint8_t b[16];
2576         uint64_t d[2];
2577     } result;
2578 
2579     /*
2580      * We must construct the final result in a temp, lest the output
2581      * overlap the input table.  For TBL, begin with zero; for TBX,
2582      * begin with the original register contents.  Note that we always
2583      * copy 16 bytes here to avoid an extra branch; clearing the high
2584      * bits of the register for oprsz == 8 is handled below.
2585      */
2586     if (is_tbx) {
2587         memcpy(&result, vd, 16);
2588     } else {
2589         memset(&result, 0, 16);
2590     }
2591 
2592     for (size_t i = 0; i < oprsz; ++i) {
2593         uint32_t index = indices[H1(i)];
2594 
2595         if (index < table_len) {
2596             /*
2597              * Convert index (a byte offset into the virtual table
2598              * which is a series of 128-bit vectors concatenated)
2599              * into the correct register element, bearing in mind
2600              * that the table can wrap around from V31 to V0.
2601              */
2602             const uint8_t *table = (const uint8_t *)
2603                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2604             result.b[H1(i)] = table[H1(index % 16)];
2605         }
2606     }
2607 
2608     memcpy(vd, &result, 16);
2609     clear_tail(vd, oprsz, simd_maxsz(desc));
2610 }
2611 #endif
2612 
2613 /*
2614  * NxN -> N highpart multiply
2615  *
2616  * TODO: expose this as a generic vector operation.
2617  */
2618 
2619 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2620 {
2621     intptr_t i, opr_sz = simd_oprsz(desc);
2622     int8_t *d = vd, *n = vn, *m = vm;
2623 
2624     for (i = 0; i < opr_sz; ++i) {
2625         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2626     }
2627     clear_tail(d, opr_sz, simd_maxsz(desc));
2628 }
2629 
2630 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2631 {
2632     intptr_t i, opr_sz = simd_oprsz(desc);
2633     int16_t *d = vd, *n = vn, *m = vm;
2634 
2635     for (i = 0; i < opr_sz / 2; ++i) {
2636         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2637     }
2638     clear_tail(d, opr_sz, simd_maxsz(desc));
2639 }
2640 
2641 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2642 {
2643     intptr_t i, opr_sz = simd_oprsz(desc);
2644     int32_t *d = vd, *n = vn, *m = vm;
2645 
2646     for (i = 0; i < opr_sz / 4; ++i) {
2647         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2648     }
2649     clear_tail(d, opr_sz, simd_maxsz(desc));
2650 }
2651 
2652 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2653 {
2654     intptr_t i, opr_sz = simd_oprsz(desc);
2655     uint64_t *d = vd, *n = vn, *m = vm;
2656     uint64_t discard;
2657 
2658     for (i = 0; i < opr_sz / 8; ++i) {
2659         muls64(&discard, &d[i], n[i], m[i]);
2660     }
2661     clear_tail(d, opr_sz, simd_maxsz(desc));
2662 }
2663 
2664 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2665 {
2666     intptr_t i, opr_sz = simd_oprsz(desc);
2667     uint8_t *d = vd, *n = vn, *m = vm;
2668 
2669     for (i = 0; i < opr_sz; ++i) {
2670         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2671     }
2672     clear_tail(d, opr_sz, simd_maxsz(desc));
2673 }
2674 
2675 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2676 {
2677     intptr_t i, opr_sz = simd_oprsz(desc);
2678     uint16_t *d = vd, *n = vn, *m = vm;
2679 
2680     for (i = 0; i < opr_sz / 2; ++i) {
2681         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2682     }
2683     clear_tail(d, opr_sz, simd_maxsz(desc));
2684 }
2685 
2686 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2687 {
2688     intptr_t i, opr_sz = simd_oprsz(desc);
2689     uint32_t *d = vd, *n = vn, *m = vm;
2690 
2691     for (i = 0; i < opr_sz / 4; ++i) {
2692         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2693     }
2694     clear_tail(d, opr_sz, simd_maxsz(desc));
2695 }
2696 
2697 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2698 {
2699     intptr_t i, opr_sz = simd_oprsz(desc);
2700     uint64_t *d = vd, *n = vn, *m = vm;
2701     uint64_t discard;
2702 
2703     for (i = 0; i < opr_sz / 8; ++i) {
2704         mulu64(&discard, &d[i], n[i], m[i]);
2705     }
2706     clear_tail(d, opr_sz, simd_maxsz(desc));
2707 }
2708 
2709 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2710 {
2711     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2712     int shr = simd_data(desc);
2713     uint64_t *d = vd, *n = vn, *m = vm;
2714 
2715     for (i = 0; i < opr_sz; ++i) {
2716         d[i] = ror64(n[i] ^ m[i], shr);
2717     }
2718     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2719 }
2720 
2721 /*
2722  * Integer matrix-multiply accumulate
2723  */
2724 
2725 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2726 {
2727     int8_t *n = vn, *m = vm;
2728 
2729     for (intptr_t k = 0; k < 8; ++k) {
2730         sum += n[H1(k)] * m[H1(k)];
2731     }
2732     return sum;
2733 }
2734 
2735 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2736 {
2737     uint8_t *n = vn, *m = vm;
2738 
2739     for (intptr_t k = 0; k < 8; ++k) {
2740         sum += n[H1(k)] * m[H1(k)];
2741     }
2742     return sum;
2743 }
2744 
2745 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2746 {
2747     uint8_t *n = vn;
2748     int8_t *m = vm;
2749 
2750     for (intptr_t k = 0; k < 8; ++k) {
2751         sum += n[H1(k)] * m[H1(k)];
2752     }
2753     return sum;
2754 }
2755 
2756 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2757                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2758 {
2759     intptr_t seg, opr_sz = simd_oprsz(desc);
2760 
2761     for (seg = 0; seg < opr_sz; seg += 16) {
2762         uint32_t *d = vd + seg;
2763         uint32_t *a = va + seg;
2764         uint32_t sum0, sum1, sum2, sum3;
2765 
2766         /*
2767          * Process the entire segment at once, writing back the
2768          * results only after we've consumed all of the inputs.
2769          *
2770          * Key to indices by column:
2771          *          i   j                  i             j
2772          */
2773         sum0 = a[H4(0 + 0)];
2774         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2775         sum1 = a[H4(0 + 1)];
2776         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2777         sum2 = a[H4(2 + 0)];
2778         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2779         sum3 = a[H4(2 + 1)];
2780         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2781 
2782         d[H4(0)] = sum0;
2783         d[H4(1)] = sum1;
2784         d[H4(2)] = sum2;
2785         d[H4(3)] = sum3;
2786     }
2787     clear_tail(vd, opr_sz, simd_maxsz(desc));
2788 }
2789 
2790 #define DO_MMLA_B(NAME, INNER) \
2791     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2792     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2793 
2794 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2795 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2796 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2797 
2798 /*
2799  * BFloat16 Dot Product
2800  */
2801 
2802 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2803 {
2804     /*
2805      * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2806      * For EBF = 0, we ignore the FPCR bits which determine rounding
2807      * mode and denormal-flushing, and we do unfused multiplies and
2808      * additions with intermediate rounding of all products and sums.
2809      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2810      * and we perform a fused two-way sum-of-products without intermediate
2811      * rounding of the products.
2812      * In either case, we don't set fp exception flags.
2813      *
2814      * EBF is AArch64 only, so even if it's set in the FPCR it has
2815      * no effect on AArch32 instructions.
2816      */
2817     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2818 
2819     *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32;
2820     set_default_nan_mode(true, statusp);
2821 
2822     if (ebf) {
2823         /* EBF=1 needs to do a step with round-to-odd semantics */
2824         *oddstatusp = *statusp;
2825         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2826     } else {
2827         set_flush_to_zero(true, statusp);
2828         set_flush_inputs_to_zero(true, statusp);
2829         set_float_rounding_mode(float_round_to_odd_inf, statusp);
2830     }
2831     return ebf;
2832 }
2833 
2834 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2835 {
2836     float32 t1, t2;
2837 
2838     /*
2839      * Extract each BFloat16 from the element pair, and shift
2840      * them such that they become float32.
2841      */
2842     t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2843     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2844     t1 = float32_add(t1, t2, fpst);
2845     t1 = float32_add(sum, t1, fpst);
2846 
2847     return t1;
2848 }
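
/*
 * A bfloat16 is bit-identical to the high half of a float32, so the
 * shift and mask above really are format conversions.  For example,
 * with the hypothetical pair e1 = 0x40003fc0 (high element 2.0, low
 * element 1.5):
 *
 *   e1 << 16         = 0x3fc00000 = float32 1.5
 *   e1 & 0xffff0000u = 0x40000000 = float32 2.0
 */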
2849 
2850 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
2851                      float_status *fpst, float_status *fpst_odd)
2852 {
2853     /*
2854      * Compare f16_dotadd() in sme_helper.c, but here we have
2855      * bfloat16 inputs. In particular that means that we do not
2856      * want the FPCR.FZ16 flush semantics, so we use the normal
2857      * float_status for the input handling here.
2858      */
2859     float64 e1r = float32_to_float64(e1 << 16, fpst);
2860     float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
2861     float64 e2r = float32_to_float64(e2 << 16, fpst);
2862     float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
2863     float64 t64;
2864     float32 t32;
2865 
2866     /*
2867      * The ARM pseudocode function FPDot performs both multiplies
2868      * and the add with a single rounding operation.  Emulate this
2869      * by performing the first multiply in round-to-odd, then doing
2870      * the second multiply as fused multiply-add, and rounding to
2871      * float32 all in one step.
2872      */
2873     t64 = float64_mul(e1r, e2r, fpst_odd);
2874     t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
2875 
2876     /* This conversion is exact, because we've already rounded. */
2877     t32 = float64_to_float32(t64, fpst);
2878 
2879     /* The final accumulation step is not fused. */
2880     return float32_add(sum, t32, fpst);
2881 }
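
/*
 * Why round-to-odd is sufficient here: forcing the low bit of an
 * inexact float64 result to 1 acts as a sticky bit, so no information
 * that could flip the final float32 rounding is lost.  The usual
 * double-rounding argument needs the wide format to carry at least
 * 2 * 24 + 2 = 50 significand bits for a 24-bit target; float64's 53
 * bits comfortably satisfy that.  A round-to-nearest intermediate, by
 * contrast, could land exactly on a float32 tie and then break it the
 * wrong way.
 */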
2882 
2883 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
2884                         CPUARMState *env, uint32_t desc)
2885 {
2886     intptr_t i, opr_sz = simd_oprsz(desc);
2887     float32 *d = vd, *a = va;
2888     uint32_t *n = vn, *m = vm;
2889     float_status fpst, fpst_odd;
2890 
2891     if (is_ebf(env, &fpst, &fpst_odd)) {
2892         for (i = 0; i < opr_sz / 4; ++i) {
2893             d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
2894         }
2895     } else {
2896         for (i = 0; i < opr_sz / 4; ++i) {
2897             d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
2898         }
2899     }
2900     clear_tail(d, opr_sz, simd_maxsz(desc));
2901 }
2902 
2903 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2904                             void *va, CPUARMState *env, uint32_t desc)
2905 {
2906     intptr_t i, j, opr_sz = simd_oprsz(desc);
2907     intptr_t index = simd_data(desc);
2908     intptr_t elements = opr_sz / 4;
2909     intptr_t eltspersegment = MIN(16 / 4, elements);
2910     float32 *d = vd, *a = va;
2911     uint32_t *n = vn, *m = vm;
2912     float_status fpst, fpst_odd;
2913 
2914     if (is_ebf(env, &fpst, &fpst_odd)) {
2915         for (i = 0; i < elements; i += eltspersegment) {
2916             uint32_t m_idx = m[i + H4(index)];
2917 
2918             for (j = i; j < i + eltspersegment; j++) {
2919                 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
2920             }
2921         }
2922     } else {
2923         for (i = 0; i < elements; i += eltspersegment) {
2924             uint32_t m_idx = m[i + H4(index)];
2925 
2926             for (j = i; j < i + eltspersegment; j++) {
2927                 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
2928             }
2929         }
2930     }
2931     clear_tail(d, opr_sz, simd_maxsz(desc));
2932 }
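
/*
 * Layout sketch for the indexed form, with a 16-byte vector (one
 * segment of four 32-bit element pairs) and index = 1: the same
 * element pair of m is used for every element pair of n, i.e.
 *
 *   d[j] = a[j] + bfdot(n[j], m[1])     for j = 0..3
 *
 * where bfdot() stands for the two-way bfloat16 dot product performed
 * by bfdotadd() or bfdotadd_ebf().
 */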
2933 
2934 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
2935                          CPUARMState *env, uint32_t desc)
2936 {
2937     intptr_t s, opr_sz = simd_oprsz(desc);
2938     float32 *d = vd, *a = va;
2939     uint32_t *n = vn, *m = vm;
2940     float_status fpst, fpst_odd;
2941 
2942     if (is_ebf(env, &fpst, &fpst_odd)) {
2943         for (s = 0; s < opr_sz / 4; s += 4) {
2944             float32 sum00, sum01, sum10, sum11;
2945 
2946             /*
2947              * Process the entire segment at once, writing back the
2948              * results only after we've consumed all of the inputs.
2949              *
2950              * Key to indices by column:
2951              *               i   j               i   k             j   k
2952              */
2953             sum00 = a[s + H4(0 + 0)];
2954             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2955             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2956 
2957             sum01 = a[s + H4(0 + 1)];
2958             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2959             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2960 
2961             sum10 = a[s + H4(2 + 0)];
2962             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2963             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2964 
2965             sum11 = a[s + H4(2 + 1)];
2966             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2967             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2968 
2969             d[s + H4(0 + 0)] = sum00;
2970             d[s + H4(0 + 1)] = sum01;
2971             d[s + H4(2 + 0)] = sum10;
2972             d[s + H4(2 + 1)] = sum11;
2973         }
2974     } else {
2975         for (s = 0; s < opr_sz / 4; s += 4) {
2976             float32 sum00, sum01, sum10, sum11;
2977 
2978             /*
2979              * Process the entire segment at once, writing back the
2980              * results only after we've consumed all of the inputs.
2981              *
2982              * Key to indices by column:
2983              *               i   j           i   k             j   k
2984              */
2985             sum00 = a[s + H4(0 + 0)];
2986             sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
2987             sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
2988 
2989             sum01 = a[s + H4(0 + 1)];
2990             sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
2991             sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
2992 
2993             sum10 = a[s + H4(2 + 0)];
2994             sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
2995             sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
2996 
2997             sum11 = a[s + H4(2 + 1)];
2998             sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
2999             sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
3000 
3001             d[s + H4(0 + 0)] = sum00;
3002             d[s + H4(0 + 1)] = sum01;
3003             d[s + H4(2 + 0)] = sum10;
3004             d[s + H4(2 + 1)] = sum11;
3005         }
3006     }
3007     clear_tail(d, opr_sz, simd_maxsz(desc));
3008 }
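
/*
 * The index pattern above is the same 2x2 matrix-multiply layout as
 * do_mmla_b: within each 128-bit segment, d[i*2+j] accumulates the
 * dot product of row i of n with row j of m, except that each k step
 * here consumes a pair of bfloat16 values via bfdotadd()/bfdotadd_ebf().
 */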
3009 
3010 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
3011                          float_status *stat, uint32_t desc)
3012 {
3013     intptr_t i, opr_sz = simd_oprsz(desc);
3014     intptr_t sel = simd_data(desc);
3015     float32 *d = vd, *a = va;
3016     bfloat16 *n = vn, *m = vm;
3017 
3018     for (i = 0; i < opr_sz / 4; ++i) {
3019         float32 nn = n[H2(i * 2 + sel)] << 16;
3020         float32 mm = m[H2(i * 2 + sel)] << 16;
3021         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
3022     }
3023     clear_tail(d, opr_sz, simd_maxsz(desc));
3024 }
3025 
3026 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
3027                              void *va, float_status *stat, uint32_t desc)
3028 {
3029     intptr_t i, j, opr_sz = simd_oprsz(desc);
3030     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3031     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3032     intptr_t elements = opr_sz / 4;
3033     intptr_t eltspersegment = MIN(16 / 4, elements);
3034     float32 *d = vd, *a = va;
3035     bfloat16 *n = vn, *m = vm;
3036 
3037     for (i = 0; i < elements; i += eltspersegment) {
3038         float32 m_idx = m[H2(2 * i + index)] << 16;
3039 
3040         for (j = i; j < i + eltspersegment; j++) {
3041             float32 n_j = n[H2(2 * j + sel)] << 16;
3042             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
3043         }
3044     }
3045     clear_tail(d, opr_sz, simd_maxsz(desc));
3046 }
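
/*
 * simd_data() layout implied by the extract32() calls above: bit 0
 * is sel, choosing the even or odd bfloat16 of each 32-bit pair of
 * n, and bits [3:1] index the bfloat16 element of m to broadcast
 * within each 128-bit segment.
 */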
3047 
3048 #define DO_CLAMP(NAME, TYPE) \
3049 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
3050 {                                                                       \
3051     intptr_t i, opr_sz = simd_oprsz(desc);                              \
3052     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
3053         TYPE aa = *(TYPE *)(a + i);                                     \
3054         TYPE nn = *(TYPE *)(n + i);                                     \
3055         TYPE mm = *(TYPE *)(m + i);                                     \
3056         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
3057         *(TYPE *)(d + i) = dd;                                          \
3058     }                                                                   \
3059     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
3060 }
3061 
3062 DO_CLAMP(gvec_sclamp_b, int8_t)
3063 DO_CLAMP(gvec_sclamp_h, int16_t)
3064 DO_CLAMP(gvec_sclamp_s, int32_t)
3065 DO_CLAMP(gvec_sclamp_d, int64_t)
3066 
3067 DO_CLAMP(gvec_uclamp_b, uint8_t)
3068 DO_CLAMP(gvec_uclamp_h, uint16_t)
3069 DO_CLAMP(gvec_uclamp_s, uint32_t)
3070 DO_CLAMP(gvec_uclamp_d, uint64_t)
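
/*
 * The expansion computes d = MIN(MAX(a, n), m): each element of a is
 * clamped to the inclusive range [n, m], n being the lower bound and
 * m the upper bound.  For example, with int8_t elements
 * (gvec_sclamp_b):
 *
 *   a = -50, n = -20, m = 100  ->  d = MIN(MAX(-50, -20), 100) = -20
 *   a =  42, n = -20, m =  10  ->  d = MIN(MAX(42, -20), 10)   = 10
 */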
3071 
3072 /* Bit count in each 8-bit word. */
3073 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
3074 {
3075     intptr_t i, opr_sz = simd_oprsz(desc);
3076     uint8_t *d = vd, *n = vn;
3077 
3078     for (i = 0; i < opr_sz; ++i) {
3079         d[i] = ctpop8(n[i]);
3080     }
3081     clear_tail(d, opr_sz, simd_maxsz(desc));
3082 }
3083 
3084 /* Reverse bits in each 8-bit word. */
3085 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
3086 {
3087     intptr_t i, opr_sz = simd_oprsz(desc);
3088     uint64_t *d = vd, *n = vn;
3089 
3090     for (i = 0; i < opr_sz / 8; ++i) {
3091         d[i] = revbit64(bswap64(n[i]));
3092     }
3093     clear_tail(d, opr_sz, simd_maxsz(desc));
3094 }
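
/*
 * revbit64() reverses all 64 bits, which reverses both the bit order
 * within each byte and the byte order itself; the preceding bswap64()
 * pre-reverses the byte order, so the two cancel and each byte ends
 * up bit-reversed in place.  For example:
 *
 *   n[i]          = 0x0000000000000001
 *   bswap64(n[i]) = 0x0100000000000000
 *   revbit64(...) = 0x0000000000000080
 */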
3095 
3096 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
3097 {
3098     intptr_t i, opr_sz = simd_oprsz(desc);
3099     uint32_t *d = vd, *n = vn;
3100 
3101     for (i = 0; i < opr_sz / 4; ++i) {
3102         d[i] = helper_recpe_u32(n[i]);
3103     }
3104     clear_tail(d, opr_sz, simd_maxsz(desc));
3105 }
3106 
3107 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
3108 {
3109     intptr_t i, opr_sz = simd_oprsz(desc);
3110     uint32_t *d = vd, *n = vn;
3111 
3112     for (i = 0; i < opr_sz / 4; ++i) {
3113         d[i] = helper_rsqrte_u32(n[i]);
3114     }
3115     clear_tail(d, opr_sz, simd_maxsz(desc));
3116 }
3117