xref: /qemu/target/arm/tcg/vec_helper.c (revision 72203eefab04a6903328807b0e3c635210031262)
1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28 
29 /*
30  * Data for expanding active predicate bits to bytes, for byte elements.
31  *
32  *  for (i = 0; i < 256; ++i) {
33  *      unsigned long m = 0;
34  *      for (j = 0; j < 8; j++) {
35  *          if ((i >> j) & 1) {
36  *              m |= 0xfful << (j << 3);
37  *          }
38  *      }
39  *      printf("0x%016lx,\n", m);
40  *  }
41  */
42 const uint64_t expand_pred_b_data[256] = {
43     0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44     0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45     0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46     0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47     0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48     0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49     0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50     0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51     0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52     0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53     0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54     0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55     0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56     0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57     0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58     0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59     0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60     0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61     0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62     0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63     0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64     0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65     0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66     0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67     0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68     0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69     0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70     0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71     0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72     0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73     0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74     0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75     0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76     0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77     0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78     0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79     0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80     0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81     0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82     0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83     0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84     0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85     0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86     0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87     0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88     0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89     0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90     0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91     0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92     0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93     0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94     0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95     0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96     0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97     0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98     0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99     0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100     0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101     0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102     0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103     0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104     0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105     0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106     0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107     0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108     0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109     0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110     0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111     0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112     0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113     0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114     0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115     0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116     0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117     0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118     0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119     0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120     0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121     0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122     0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123     0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124     0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125     0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126     0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127     0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128     0xffffffffffffffff,
129 };
130 
131 /*
132  * Similarly for half-word elements.
133  *  for (i = 0; i < 256; ++i) {
134  *      unsigned long m = 0;
135  *      if (i & 0xaa) {
136  *          continue;
137  *      }
138  *      for (j = 0; j < 8; j += 2) {
139  *          if ((i >> j) & 1) {
140  *              m |= 0xfffful << (j << 3);
141  *          }
142  *      }
143  *      printf("[0x%x] = 0x%016lx,\n", i, m);
144  *  }
145  */
146 const uint64_t expand_pred_h_data[0x55 + 1] = {
147     [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148     [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149     [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150     [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151     [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152     [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153     [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154     [0x55] = 0xffffffffffffffff,
155 };
156 
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159                      bool neg, bool round)
160 {
161     /*
162      * Simplify:
163      * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164      * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
165      */
166     int32_t ret = (int32_t)src1 * src2;
167     if (neg) {
168         ret = -ret;
169     }
170     ret += ((int32_t)src3 << 7) + (round << 6);
171     ret >>= 7;
172 
173     if (ret != (int8_t)ret) {
174         ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175     }
176     return ret;
177 }
178 
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180                              void *va, uint32_t desc)
181 {
182     intptr_t i, opr_sz = simd_oprsz(desc);
183     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 
185     for (i = 0; i < opr_sz; ++i) {
186         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187     }
188 }
189 
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191                              void *va, uint32_t desc)
192 {
193     intptr_t i, opr_sz = simd_oprsz(desc);
194     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 
196     for (i = 0; i < opr_sz; ++i) {
197         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198     }
199 }
200 
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203     intptr_t i, opr_sz = simd_oprsz(desc);
204     int8_t *d = vd, *n = vn, *m = vm;
205 
206     for (i = 0; i < opr_sz; ++i) {
207         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208     }
209 }
210 
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213     intptr_t i, opr_sz = simd_oprsz(desc);
214     int8_t *d = vd, *n = vn, *m = vm;
215 
216     for (i = 0; i < opr_sz; ++i) {
217         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218     }
219 }
220 
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223                       bool neg, bool round, uint32_t *sat)
224 {
225     /* Simplify similarly to do_sqrdmlah_b above.  */
226     int32_t ret = (int32_t)src1 * src2;
227     if (neg) {
228         ret = -ret;
229     }
230     ret += ((int32_t)src3 << 15) + (round << 14);
231     ret >>= 15;
232 
233     if (ret != (int16_t)ret) {
234         *sat = 1;
235         ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236     }
237     return ret;
238 }
239 
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241                                   uint32_t src2, uint32_t src3)
242 {
243     uint32_t *sat = &env->vfp.qc[0];
244     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246                                 false, true, sat);
247     return deposit32(e1, 16, 16, e2);
248 }
249 
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251                               void *vq, uint32_t desc)
252 {
253     uintptr_t opr_sz = simd_oprsz(desc);
254     int16_t *d = vd;
255     int16_t *n = vn;
256     int16_t *m = vm;
257     uintptr_t i;
258 
259     for (i = 0; i < opr_sz / 2; ++i) {
260         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261     }
262     clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264 
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266                                   uint32_t src2, uint32_t src3)
267 {
268     uint32_t *sat = &env->vfp.qc[0];
269     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271                                 true, true, sat);
272     return deposit32(e1, 16, 16, e2);
273 }
274 
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276                               void *vq, uint32_t desc)
277 {
278     uintptr_t opr_sz = simd_oprsz(desc);
279     int16_t *d = vd;
280     int16_t *n = vn;
281     int16_t *m = vm;
282     uintptr_t i;
283 
284     for (i = 0; i < opr_sz / 2; ++i) {
285         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286     }
287     clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289 
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291                             void *vq, uint32_t desc)
292 {
293     intptr_t i, opr_sz = simd_oprsz(desc);
294     int16_t *d = vd, *n = vn, *m = vm;
295 
296     for (i = 0; i < opr_sz / 2; ++i) {
297         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298     }
299     clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301 
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303                              void *vq, uint32_t desc)
304 {
305     intptr_t i, opr_sz = simd_oprsz(desc);
306     int16_t *d = vd, *n = vn, *m = vm;
307 
308     for (i = 0; i < opr_sz / 2; ++i) {
309         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310     }
311     clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313 
314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315                                 void *vq, uint32_t desc)
316 {
317     intptr_t i, j, opr_sz = simd_oprsz(desc);
318     int idx = simd_data(desc);
319     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320     intptr_t elements = opr_sz / 2;
321     intptr_t eltspersegment = MIN(16 / 2, elements);
322 
323     for (i = 0; i < elements; i += 16 / 2) {
324         int16_t mm = m[i];
325         for (j = 0; j < eltspersegment; ++j) {
326             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
327         }
328     }
329     clear_tail(d, opr_sz, simd_maxsz(desc));
330 }
331 
332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
333                                  void *vq, uint32_t desc)
334 {
335     intptr_t i, j, opr_sz = simd_oprsz(desc);
336     int idx = simd_data(desc);
337     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338     intptr_t elements = opr_sz / 2;
339     intptr_t eltspersegment = MIN(16 / 2, elements);
340 
341     for (i = 0; i < elements; i += 16 / 2) {
342         int16_t mm = m[i];
343         for (j = 0; j < eltspersegment; ++j) {
344             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
345         }
346     }
347     clear_tail(d, opr_sz, simd_maxsz(desc));
348 }
349 
350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
351                                  void *vq, uint32_t desc)
352 {
353     intptr_t i, j, opr_sz = simd_oprsz(desc);
354     int idx = simd_data(desc);
355     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
356     intptr_t elements = opr_sz / 2;
357     intptr_t eltspersegment = MIN(16 / 2, elements);
358 
359     for (i = 0; i < elements; i += 16 / 2) {
360         int16_t mm = m[i];
361         for (j = 0; j < eltspersegment; ++j) {
362             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
363         }
364     }
365     clear_tail(d, opr_sz, simd_maxsz(desc));
366 }
367 
368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
369                                  void *vq, uint32_t desc)
370 {
371     intptr_t i, j, opr_sz = simd_oprsz(desc);
372     int idx = simd_data(desc);
373     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
374     intptr_t elements = opr_sz / 2;
375     intptr_t eltspersegment = MIN(16 / 2, elements);
376 
377     for (i = 0; i < elements; i += 16 / 2) {
378         int16_t mm = m[i];
379         for (j = 0; j < eltspersegment; ++j) {
380             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
381         }
382     }
383     clear_tail(d, opr_sz, simd_maxsz(desc));
384 }
385 
386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
387                              void *va, uint32_t desc)
388 {
389     intptr_t i, opr_sz = simd_oprsz(desc);
390     int16_t *d = vd, *n = vn, *m = vm, *a = va;
391     uint32_t discard;
392 
393     for (i = 0; i < opr_sz / 2; ++i) {
394         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
395     }
396 }
397 
398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
399                              void *va, uint32_t desc)
400 {
401     intptr_t i, opr_sz = simd_oprsz(desc);
402     int16_t *d = vd, *n = vn, *m = vm, *a = va;
403     uint32_t discard;
404 
405     for (i = 0; i < opr_sz / 2; ++i) {
406         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
407     }
408 }
409 
410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
411 {
412     intptr_t i, opr_sz = simd_oprsz(desc);
413     int16_t *d = vd, *n = vn, *m = vm;
414     uint32_t discard;
415 
416     for (i = 0; i < opr_sz / 2; ++i) {
417         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
418     }
419 }
420 
421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
422 {
423     intptr_t i, opr_sz = simd_oprsz(desc);
424     int16_t *d = vd, *n = vn, *m = vm;
425     uint32_t discard;
426 
427     for (i = 0; i < opr_sz / 2; ++i) {
428         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
429     }
430 }
431 
432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
433 {
434     intptr_t i, j, opr_sz = simd_oprsz(desc);
435     int idx = simd_data(desc);
436     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
437     uint32_t discard;
438 
439     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
440         int16_t mm = m[i];
441         for (j = 0; j < 16 / 2; ++j) {
442             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
443         }
444     }
445 }
446 
447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
448 {
449     intptr_t i, j, opr_sz = simd_oprsz(desc);
450     int idx = simd_data(desc);
451     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
452     uint32_t discard;
453 
454     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
455         int16_t mm = m[i];
456         for (j = 0; j < 16 / 2; ++j) {
457             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
458         }
459     }
460 }
461 
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
464                       bool neg, bool round, uint32_t *sat)
465 {
466     /* Simplify similarly to do_sqrdmlah_b above.  */
467     int64_t ret = (int64_t)src1 * src2;
468     if (neg) {
469         ret = -ret;
470     }
471     ret += ((int64_t)src3 << 31) + (round << 30);
472     ret >>= 31;
473 
474     if (ret != (int32_t)ret) {
475         *sat = 1;
476         ret = (ret < 0 ? INT32_MIN : INT32_MAX);
477     }
478     return ret;
479 }
480 
481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
482                                   int32_t src2, int32_t src3)
483 {
484     uint32_t *sat = &env->vfp.qc[0];
485     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
486 }
487 
488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
489                               void *vq, uint32_t desc)
490 {
491     uintptr_t opr_sz = simd_oprsz(desc);
492     int32_t *d = vd;
493     int32_t *n = vn;
494     int32_t *m = vm;
495     uintptr_t i;
496 
497     for (i = 0; i < opr_sz / 4; ++i) {
498         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
499     }
500     clear_tail(d, opr_sz, simd_maxsz(desc));
501 }
502 
503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
504                                   int32_t src2, int32_t src3)
505 {
506     uint32_t *sat = &env->vfp.qc[0];
507     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
508 }
509 
510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
511                               void *vq, uint32_t desc)
512 {
513     uintptr_t opr_sz = simd_oprsz(desc);
514     int32_t *d = vd;
515     int32_t *n = vn;
516     int32_t *m = vm;
517     uintptr_t i;
518 
519     for (i = 0; i < opr_sz / 4; ++i) {
520         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
521     }
522     clear_tail(d, opr_sz, simd_maxsz(desc));
523 }
524 
525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
526                             void *vq, uint32_t desc)
527 {
528     intptr_t i, opr_sz = simd_oprsz(desc);
529     int32_t *d = vd, *n = vn, *m = vm;
530 
531     for (i = 0; i < opr_sz / 4; ++i) {
532         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
533     }
534     clear_tail(d, opr_sz, simd_maxsz(desc));
535 }
536 
537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
538                              void *vq, uint32_t desc)
539 {
540     intptr_t i, opr_sz = simd_oprsz(desc);
541     int32_t *d = vd, *n = vn, *m = vm;
542 
543     for (i = 0; i < opr_sz / 4; ++i) {
544         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
545     }
546     clear_tail(d, opr_sz, simd_maxsz(desc));
547 }
548 
549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
550                                 void *vq, uint32_t desc)
551 {
552     intptr_t i, j, opr_sz = simd_oprsz(desc);
553     int idx = simd_data(desc);
554     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
555     intptr_t elements = opr_sz / 4;
556     intptr_t eltspersegment = MIN(16 / 4, elements);
557 
558     for (i = 0; i < elements; i += 16 / 4) {
559         int32_t mm = m[i];
560         for (j = 0; j < eltspersegment; ++j) {
561             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
562         }
563     }
564     clear_tail(d, opr_sz, simd_maxsz(desc));
565 }
566 
567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
568                                  void *vq, uint32_t desc)
569 {
570     intptr_t i, j, opr_sz = simd_oprsz(desc);
571     int idx = simd_data(desc);
572     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
573     intptr_t elements = opr_sz / 4;
574     intptr_t eltspersegment = MIN(16 / 4, elements);
575 
576     for (i = 0; i < elements; i += 16 / 4) {
577         int32_t mm = m[i];
578         for (j = 0; j < eltspersegment; ++j) {
579             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
580         }
581     }
582     clear_tail(d, opr_sz, simd_maxsz(desc));
583 }
584 
585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
586                                  void *vq, uint32_t desc)
587 {
588     intptr_t i, j, opr_sz = simd_oprsz(desc);
589     int idx = simd_data(desc);
590     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
591     intptr_t elements = opr_sz / 4;
592     intptr_t eltspersegment = MIN(16 / 4, elements);
593 
594     for (i = 0; i < elements; i += 16 / 4) {
595         int32_t mm = m[i];
596         for (j = 0; j < eltspersegment; ++j) {
597             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
598         }
599     }
600     clear_tail(d, opr_sz, simd_maxsz(desc));
601 }
602 
603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
604                                  void *vq, uint32_t desc)
605 {
606     intptr_t i, j, opr_sz = simd_oprsz(desc);
607     int idx = simd_data(desc);
608     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
609     intptr_t elements = opr_sz / 4;
610     intptr_t eltspersegment = MIN(16 / 4, elements);
611 
612     for (i = 0; i < elements; i += 16 / 4) {
613         int32_t mm = m[i];
614         for (j = 0; j < eltspersegment; ++j) {
615             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
616         }
617     }
618     clear_tail(d, opr_sz, simd_maxsz(desc));
619 }
620 
621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
622                              void *va, uint32_t desc)
623 {
624     intptr_t i, opr_sz = simd_oprsz(desc);
625     int32_t *d = vd, *n = vn, *m = vm, *a = va;
626     uint32_t discard;
627 
628     for (i = 0; i < opr_sz / 4; ++i) {
629         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
630     }
631 }
632 
633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
634                              void *va, uint32_t desc)
635 {
636     intptr_t i, opr_sz = simd_oprsz(desc);
637     int32_t *d = vd, *n = vn, *m = vm, *a = va;
638     uint32_t discard;
639 
640     for (i = 0; i < opr_sz / 4; ++i) {
641         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
642     }
643 }
644 
645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
646 {
647     intptr_t i, opr_sz = simd_oprsz(desc);
648     int32_t *d = vd, *n = vn, *m = vm;
649     uint32_t discard;
650 
651     for (i = 0; i < opr_sz / 4; ++i) {
652         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
653     }
654 }
655 
656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
657 {
658     intptr_t i, opr_sz = simd_oprsz(desc);
659     int32_t *d = vd, *n = vn, *m = vm;
660     uint32_t discard;
661 
662     for (i = 0; i < opr_sz / 4; ++i) {
663         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
664     }
665 }
666 
667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
668 {
669     intptr_t i, j, opr_sz = simd_oprsz(desc);
670     int idx = simd_data(desc);
671     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
672     uint32_t discard;
673 
674     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
675         int32_t mm = m[i];
676         for (j = 0; j < 16 / 4; ++j) {
677             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
678         }
679     }
680 }
681 
682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
683 {
684     intptr_t i, j, opr_sz = simd_oprsz(desc);
685     int idx = simd_data(desc);
686     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
687     uint32_t discard;
688 
689     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
690         int32_t mm = m[i];
691         for (j = 0; j < 16 / 4; ++j) {
692             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
693         }
694     }
695 }
696 
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
698 static int64_t do_sat128_d(Int128 r)
699 {
700     int64_t ls = int128_getlo(r);
701     int64_t hs = int128_gethi(r);
702 
703     if (unlikely(hs != (ls >> 63))) {
704         return hs < 0 ? INT64_MIN : INT64_MAX;
705     }
706     return ls;
707 }
708 
709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710 {
711     uint64_t l, h;
712     Int128 r, t;
713 
714     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
715     muls64(&l, &h, m, n);
716     r = int128_make128(l, h);
717     if (neg) {
718         r = int128_neg(r);
719     }
720     if (a) {
721         t = int128_exts64(a);
722         t = int128_lshift(t, 63);
723         r = int128_add(r, t);
724     }
725     if (round) {
726         t = int128_exts64(1ll << 62);
727         r = int128_add(r, t);
728     }
729     r = int128_rshift(r, 63);
730 
731     return do_sat128_d(r);
732 }
733 
734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735                              void *va, uint32_t desc)
736 {
737     intptr_t i, opr_sz = simd_oprsz(desc);
738     int64_t *d = vd, *n = vn, *m = vm, *a = va;
739 
740     for (i = 0; i < opr_sz / 8; ++i) {
741         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742     }
743 }
744 
745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746                              void *va, uint32_t desc)
747 {
748     intptr_t i, opr_sz = simd_oprsz(desc);
749     int64_t *d = vd, *n = vn, *m = vm, *a = va;
750 
751     for (i = 0; i < opr_sz / 8; ++i) {
752         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753     }
754 }
755 
756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757 {
758     intptr_t i, opr_sz = simd_oprsz(desc);
759     int64_t *d = vd, *n = vn, *m = vm;
760 
761     for (i = 0; i < opr_sz / 8; ++i) {
762         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763     }
764 }
765 
766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767 {
768     intptr_t i, opr_sz = simd_oprsz(desc);
769     int64_t *d = vd, *n = vn, *m = vm;
770 
771     for (i = 0; i < opr_sz / 8; ++i) {
772         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773     }
774 }
775 
776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777 {
778     intptr_t i, j, opr_sz = simd_oprsz(desc);
779     int idx = simd_data(desc);
780     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781 
782     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783         int64_t mm = m[i];
784         for (j = 0; j < 16 / 8; ++j) {
785             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786         }
787     }
788 }
789 
790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791 {
792     intptr_t i, j, opr_sz = simd_oprsz(desc);
793     int idx = simd_data(desc);
794     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795 
796     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797         int64_t mm = m[i];
798         for (j = 0; j < 16 / 8; ++j) {
799             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800         }
801     }
802 }
803 
804 /* Integer 8 and 16-bit dot-product.
805  *
806  * Note that for the loops herein, host endianness does not matter
807  * with respect to the ordering of data within the quad-width lanes.
808  * All elements are treated equally, no matter where they are.
809  */
810 
811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
813 {                                                                         \
814     intptr_t i, opr_sz = simd_oprsz(desc);                                \
815     TYPED *d = vd, *a = va;                                               \
816     TYPEN *n = vn;                                                        \
817     TYPEM *m = vm;                                                        \
818     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
819         d[i] = (a[i] +                                                    \
820                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
821                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
822                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
823                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
824     }                                                                     \
825     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
826 }
827 
828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
833 
834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
836 {                                                                         \
837     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
838     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
839     /*                                                                    \
840      * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
841      * first iteration might not be a full 16 byte segment. But           \
842      * for vector lengths beyond that this must be SVE and we know        \
843      * opr_sz is a multiple of 16, so we need not clamp segend            \
844      * to opr_sz_n when we advance it at the end of the loop.             \
845      */                                                                   \
846     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
847     intptr_t index = simd_data(desc);                                     \
848     TYPED *d = vd, *a = va;                                               \
849     TYPEN *n = vn;                                                        \
850     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
851     do {                                                                  \
852         TYPED m0 = m_indexed[i * 4 + 0];                                  \
853         TYPED m1 = m_indexed[i * 4 + 1];                                  \
854         TYPED m2 = m_indexed[i * 4 + 2];                                  \
855         TYPED m3 = m_indexed[i * 4 + 3];                                  \
856         do {                                                              \
857             d[i] = (a[i] +                                                \
858                     n[i * 4 + 0] * m0 +                                   \
859                     n[i * 4 + 1] * m1 +                                   \
860                     n[i * 4 + 2] * m2 +                                   \
861                     n[i * 4 + 3] * m3);                                   \
862         } while (++i < segend);                                           \
863         segend = i + (16 / sizeof(TYPED));                                \
864     } while (i < opr_sz_n);                                               \
865     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
866 }
867 
868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
874 
875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
876                          float_status *fpst, uint32_t desc)
877 {
878     uintptr_t opr_sz = simd_oprsz(desc);
879     float16 *d = vd;
880     float16 *n = vn;
881     float16 *m = vm;
882     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
883     bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
884     uintptr_t i;
885 
886     for (i = 0; i < opr_sz / 2; i += 2) {
887         float16 e0 = n[H2(i)];
888         float16 e1 = m[H2(i + 1)];
889         float16 e2 = n[H2(i + 1)];
890         float16 e3 = m[H2(i)];
891 
892         if (rot) {
893             e3 = float16_maybe_ah_chs(e3, fpcr_ah);
894         } else {
895             e1 = float16_maybe_ah_chs(e1, fpcr_ah);
896         }
897 
898         d[H2(i)] = float16_add(e0, e1, fpst);
899         d[H2(i + 1)] = float16_add(e2, e3, fpst);
900     }
901     clear_tail(d, opr_sz, simd_maxsz(desc));
902 }
903 
904 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
905                          float_status *fpst, uint32_t desc)
906 {
907     uintptr_t opr_sz = simd_oprsz(desc);
908     float32 *d = vd;
909     float32 *n = vn;
910     float32 *m = vm;
911     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
912     bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
913     uintptr_t i;
914 
915     for (i = 0; i < opr_sz / 4; i += 2) {
916         float32 e0 = n[H4(i)];
917         float32 e1 = m[H4(i + 1)];
918         float32 e2 = n[H4(i + 1)];
919         float32 e3 = m[H4(i)];
920 
921         if (rot) {
922             e3 = float32_maybe_ah_chs(e3, fpcr_ah);
923         } else {
924             e1 = float32_maybe_ah_chs(e1, fpcr_ah);
925         }
926 
927         d[H4(i)] = float32_add(e0, e1, fpst);
928         d[H4(i + 1)] = float32_add(e2, e3, fpst);
929     }
930     clear_tail(d, opr_sz, simd_maxsz(desc));
931 }
932 
933 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
934                          float_status *fpst, uint32_t desc)
935 {
936     uintptr_t opr_sz = simd_oprsz(desc);
937     float64 *d = vd;
938     float64 *n = vn;
939     float64 *m = vm;
940     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
941     bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
942     uintptr_t i;
943 
944     for (i = 0; i < opr_sz / 8; i += 2) {
945         float64 e0 = n[i];
946         float64 e1 = m[i + 1];
947         float64 e2 = n[i + 1];
948         float64 e3 = m[i];
949 
950         if (rot) {
951             e3 = float64_maybe_ah_chs(e3, fpcr_ah);
952         } else {
953             e1 = float64_maybe_ah_chs(e1, fpcr_ah);
954         }
955 
956         d[i] = float64_add(e0, e1, fpst);
957         d[i + 1] = float64_add(e2, e3, fpst);
958     }
959     clear_tail(d, opr_sz, simd_maxsz(desc));
960 }
961 
962 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
963                          float_status *fpst, uint32_t desc)
964 {
965     uintptr_t opr_sz = simd_oprsz(desc);
966     float16 *d = vd, *n = vn, *m = vm, *a = va;
967     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
968     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
969     uint32_t neg_real = flip ^ neg_imag;
970     uintptr_t i;
971 
972     /* Shift boolean to the sign bit so we can xor to negate.  */
973     neg_real <<= 15;
974     neg_imag <<= 15;
975 
976     for (i = 0; i < opr_sz / 2; i += 2) {
977         float16 e2 = n[H2(i + flip)];
978         float16 e1 = m[H2(i + flip)] ^ neg_real;
979         float16 e4 = e2;
980         float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
981 
982         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
983         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
984     }
985     clear_tail(d, opr_sz, simd_maxsz(desc));
986 }
987 
988 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
989                              float_status *fpst, uint32_t desc)
990 {
991     uintptr_t opr_sz = simd_oprsz(desc);
992     float16 *d = vd, *n = vn, *m = vm, *a = va;
993     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
994     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
995     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
996     uint32_t neg_real = flip ^ neg_imag;
997     intptr_t elements = opr_sz / sizeof(float16);
998     intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
999     intptr_t i, j;
1000 
1001     /* Shift boolean to the sign bit so we can xor to negate.  */
1002     neg_real <<= 15;
1003     neg_imag <<= 15;
1004 
1005     for (i = 0; i < elements; i += eltspersegment) {
1006         float16 mr = m[H2(i + 2 * index + 0)];
1007         float16 mi = m[H2(i + 2 * index + 1)];
1008         float16 e1 = neg_real ^ (flip ? mi : mr);
1009         float16 e3 = neg_imag ^ (flip ? mr : mi);
1010 
1011         for (j = i; j < i + eltspersegment; j += 2) {
1012             float16 e2 = n[H2(j + flip)];
1013             float16 e4 = e2;
1014 
1015             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
1016             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
1017         }
1018     }
1019     clear_tail(d, opr_sz, simd_maxsz(desc));
1020 }
1021 
1022 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
1023                          float_status *fpst, uint32_t desc)
1024 {
1025     uintptr_t opr_sz = simd_oprsz(desc);
1026     float32 *d = vd, *n = vn, *m = vm, *a = va;
1027     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1028     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1029     uint32_t neg_real = flip ^ neg_imag;
1030     uintptr_t i;
1031 
1032     /* Shift boolean to the sign bit so we can xor to negate.  */
1033     neg_real <<= 31;
1034     neg_imag <<= 31;
1035 
1036     for (i = 0; i < opr_sz / 4; i += 2) {
1037         float32 e2 = n[H4(i + flip)];
1038         float32 e1 = m[H4(i + flip)] ^ neg_real;
1039         float32 e4 = e2;
1040         float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
1041 
1042         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
1043         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
1044     }
1045     clear_tail(d, opr_sz, simd_maxsz(desc));
1046 }
1047 
1048 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
1049                              float_status *fpst, uint32_t desc)
1050 {
1051     uintptr_t opr_sz = simd_oprsz(desc);
1052     float32 *d = vd, *n = vn, *m = vm, *a = va;
1053     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1054     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1055     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1056     uint32_t neg_real = flip ^ neg_imag;
1057     intptr_t elements = opr_sz / sizeof(float32);
1058     intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
1059     intptr_t i, j;
1060 
1061     /* Shift boolean to the sign bit so we can xor to negate.  */
1062     neg_real <<= 31;
1063     neg_imag <<= 31;
1064 
1065     for (i = 0; i < elements; i += eltspersegment) {
1066         float32 mr = m[H4(i + 2 * index + 0)];
1067         float32 mi = m[H4(i + 2 * index + 1)];
1068         float32 e1 = neg_real ^ (flip ? mi : mr);
1069         float32 e3 = neg_imag ^ (flip ? mr : mi);
1070 
1071         for (j = i; j < i + eltspersegment; j += 2) {
1072             float32 e2 = n[H4(j + flip)];
1073             float32 e4 = e2;
1074 
1075             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
1076             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
1077         }
1078     }
1079     clear_tail(d, opr_sz, simd_maxsz(desc));
1080 }
1081 
1082 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
1083                          float_status *fpst, uint32_t desc)
1084 {
1085     uintptr_t opr_sz = simd_oprsz(desc);
1086     float64 *d = vd, *n = vn, *m = vm, *a = va;
1087     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1088     uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1089     uint64_t neg_real = flip ^ neg_imag;
1090     uintptr_t i;
1091 
1092     /* Shift boolean to the sign bit so we can xor to negate.  */
1093     neg_real <<= 63;
1094     neg_imag <<= 63;
1095 
1096     for (i = 0; i < opr_sz / 8; i += 2) {
1097         float64 e2 = n[i + flip];
1098         float64 e1 = m[i + flip] ^ neg_real;
1099         float64 e4 = e2;
1100         float64 e3 = m[i + 1 - flip] ^ neg_imag;
1101 
1102         d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
1103         d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
1104     }
1105     clear_tail(d, opr_sz, simd_maxsz(desc));
1106 }
1107 
1108 /*
1109  * Floating point comparisons producing an integer result (all 1s or all 0s).
1110  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1111  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1112  */
1113 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1114 {
1115     return -float16_eq_quiet(op1, op2, stat);
1116 }
1117 
1118 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1119 {
1120     return -float32_eq_quiet(op1, op2, stat);
1121 }
1122 
1123 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1124 {
1125     return -float64_eq_quiet(op1, op2, stat);
1126 }
1127 
1128 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1129 {
1130     return -float16_le(op2, op1, stat);
1131 }
1132 
1133 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1134 {
1135     return -float32_le(op2, op1, stat);
1136 }
1137 
1138 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1139 {
1140     return -float64_le(op2, op1, stat);
1141 }
1142 
1143 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1144 {
1145     return -float16_lt(op2, op1, stat);
1146 }
1147 
1148 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1149 {
1150     return -float32_lt(op2, op1, stat);
1151 }
1152 
1153 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1154 {
1155     return -float64_lt(op2, op1, stat);
1156 }
1157 
1158 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1159 {
1160     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1161 }
1162 
1163 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1164 {
1165     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1166 }
1167 
1168 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1169 {
1170     return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1171 }
1172 
1173 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1174 {
1175     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1176 }
1177 
1178 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1179 {
1180     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1181 }
1182 
1183 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1184 {
1185     return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1186 }
1187 
1188 static int16_t vfp_tosszh(float16 x, float_status *fpst)
1189 {
1190     if (float16_is_any_nan(x)) {
1191         float_raise(float_flag_invalid, fpst);
1192         return 0;
1193     }
1194     return float16_to_int16_round_to_zero(x, fpst);
1195 }
1196 
1197 static uint16_t vfp_touszh(float16 x, float_status *fpst)
1198 {
1199     if (float16_is_any_nan(x)) {
1200         float_raise(float_flag_invalid, fpst);
1201         return 0;
1202     }
1203     return float16_to_uint16_round_to_zero(x, fpst);
1204 }
1205 
1206 #define DO_2OP(NAME, FUNC, TYPE) \
1207 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc)  \
1208 {                                                                 \
1209     intptr_t i, oprsz = simd_oprsz(desc);                         \
1210     TYPE *d = vd, *n = vn;                                        \
1211     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1212         d[i] = FUNC(n[i], stat);                                  \
1213     }                                                             \
1214     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1215 }
1216 
1217 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1218 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1219 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1220 
1221 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1222 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1223 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1224 
1225 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1226 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1227 
1228 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1229 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1230 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1231 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1232 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1233 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1234 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1235 DO_2OP(gvec_touszh, vfp_touszh, float16)
1236 
1237 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1238     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1239     {                                                           \
1240         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1241     }
1242 
1243 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1244     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)    \
1245     {                                                           \
1246         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1247     }
1248 
1249 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1250     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1251     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1252     WRAP_CMP0_##DIRN(FN, CMPOP, float64)                \
1253     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1254     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)   \
1255     DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)
1256 
1257 DO_2OP_CMP0(cgt, cgt, FWD)
1258 DO_2OP_CMP0(cge, cge, FWD)
1259 DO_2OP_CMP0(ceq, ceq, FWD)
1260 DO_2OP_CMP0(clt, cgt, REV)
1261 DO_2OP_CMP0(cle, cge, REV)
1262 
1263 #undef DO_2OP
1264 #undef DO_2OP_CMP0
1265 
1266 /* Floating-point trigonometric starting value.
1267  * See the ARM ARM pseudocode function FPTrigSMul.
1268  */
1269 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1270 {
1271     float16 result = float16_mul(op1, op1, stat);
1272     if (!float16_is_any_nan(result)) {
1273         result = float16_set_sign(result, op2 & 1);
1274     }
1275     return result;
1276 }
1277 
1278 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1279 {
1280     float32 result = float32_mul(op1, op1, stat);
1281     if (!float32_is_any_nan(result)) {
1282         result = float32_set_sign(result, op2 & 1);
1283     }
1284     return result;
1285 }
1286 
1287 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1288 {
1289     float64 result = float64_mul(op1, op1, stat);
1290     if (!float64_is_any_nan(result)) {
1291         result = float64_set_sign(result, op2 & 1);
1292     }
1293     return result;
1294 }
1295 
1296 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1297 {
1298     return float16_abs(float16_sub(op1, op2, stat));
1299 }
1300 
1301 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1302 {
1303     return float32_abs(float32_sub(op1, op2, stat));
1304 }
1305 
1306 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1307 {
1308     return float64_abs(float64_sub(op1, op2, stat));
1309 }
1310 
1311 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
1312 static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
1313 {
1314     float16 r = float16_sub(op1, op2, stat);
1315     return float16_is_any_nan(r) ? r : float16_abs(r);
1316 }
1317 
1318 static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
1319 {
1320     float32 r = float32_sub(op1, op2, stat);
1321     return float32_is_any_nan(r) ? r : float32_abs(r);
1322 }
1323 
1324 static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
1325 {
1326     float64 r = float64_sub(op1, op2, stat);
1327     return float64_is_any_nan(r) ? r : float64_abs(r);
1328 }
1329 
1330 /*
1331  * Reciprocal step. These are the AArch32 version which uses a
1332  * non-fused multiply-and-subtract.
1333  */
1334 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1335 {
1336     op1 = float16_squash_input_denormal(op1, stat);
1337     op2 = float16_squash_input_denormal(op2, stat);
1338 
1339     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1340         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1341         return float16_two;
1342     }
1343     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1344 }
1345 
1346 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1347 {
1348     op1 = float32_squash_input_denormal(op1, stat);
1349     op2 = float32_squash_input_denormal(op2, stat);
1350 
1351     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1352         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1353         return float32_two;
1354     }
1355     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1356 }
1357 
1358 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1359 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1360 {
1361     op1 = float16_squash_input_denormal(op1, stat);
1362     op2 = float16_squash_input_denormal(op2, stat);
1363 
1364     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1365         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1366         return float16_one_point_five;
1367     }
1368     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1369     return float16_div(op1, float16_two, stat);
1370 }
1371 
1372 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1373 {
1374     op1 = float32_squash_input_denormal(op1, stat);
1375     op2 = float32_squash_input_denormal(op2, stat);
1376 
1377     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1378         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1379         return float32_one_point_five;
1380     }
1381     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1382     return float32_div(op1, float32_two, stat);
1383 }
1384 
1385 #define DO_3OP(NAME, FUNC, TYPE) \
1386 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1387                   float_status *stat, uint32_t desc)                       \
1388 {                                                                          \
1389     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1390     TYPE *d = vd, *n = vn, *m = vm;                                        \
1391     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1392         d[i] = FUNC(n[i], m[i], stat);                                     \
1393     }                                                                      \
1394     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1395 }
1396 
1397 DO_3OP(gvec_fadd_h, float16_add, float16)
1398 DO_3OP(gvec_fadd_s, float32_add, float32)
1399 DO_3OP(gvec_fadd_d, float64_add, float64)
1400 
1401 DO_3OP(gvec_fsub_h, float16_sub, float16)
1402 DO_3OP(gvec_fsub_s, float32_sub, float32)
1403 DO_3OP(gvec_fsub_d, float64_sub, float64)
1404 
1405 DO_3OP(gvec_fmul_h, float16_mul, float16)
1406 DO_3OP(gvec_fmul_s, float32_mul, float32)
1407 DO_3OP(gvec_fmul_d, float64_mul, float64)
1408 
1409 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1410 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1411 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1412 
1413 DO_3OP(gvec_fabd_h, float16_abd, float16)
1414 DO_3OP(gvec_fabd_s, float32_abd, float32)
1415 DO_3OP(gvec_fabd_d, float64_abd, float64)
1416 
1417 DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
1418 DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
1419 DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)
1420 
1421 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1422 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1423 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1424 
1425 DO_3OP(gvec_fcge_h, float16_cge, float16)
1426 DO_3OP(gvec_fcge_s, float32_cge, float32)
1427 DO_3OP(gvec_fcge_d, float64_cge, float64)
1428 
1429 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1430 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1431 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1432 
1433 DO_3OP(gvec_facge_h, float16_acge, float16)
1434 DO_3OP(gvec_facge_s, float32_acge, float32)
1435 DO_3OP(gvec_facge_d, float64_acge, float64)
1436 
1437 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1438 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1439 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1440 
1441 DO_3OP(gvec_fmax_h, float16_max, float16)
1442 DO_3OP(gvec_fmax_s, float32_max, float32)
1443 DO_3OP(gvec_fmax_d, float64_max, float64)
1444 
1445 DO_3OP(gvec_fmin_h, float16_min, float16)
1446 DO_3OP(gvec_fmin_s, float32_min, float32)
1447 DO_3OP(gvec_fmin_d, float64_min, float64)
1448 
1449 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1450 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1451 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1452 
1453 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1454 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1455 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1456 
1457 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1458 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1459 
1460 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1461 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1462 
1463 #ifdef TARGET_AARCH64
1464 DO_3OP(gvec_fdiv_h, float16_div, float16)
1465 DO_3OP(gvec_fdiv_s, float32_div, float32)
1466 DO_3OP(gvec_fdiv_d, float64_div, float64)
1467 
1468 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1469 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1470 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1471 
1472 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1473 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1474 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1475 
1476 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1477 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1478 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1479 
1480 DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
1481 DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
1482 DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)
1483 
1484 DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
1485 DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
1486 DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)
1487 
1488 #endif
1489 #undef DO_3OP
1490 
1491 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1492 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1493                                  float_status *stat)
1494 {
1495     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1496 }
1497 
1498 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1499                                  float_status *stat)
1500 {
1501     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1502 }
1503 
1504 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1505                                  float_status *stat)
1506 {
1507     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1508 }
1509 
1510 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1511                                  float_status *stat)
1512 {
1513     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1514 }
1515 
1516 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1517 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1518                                 float_status *stat)
1519 {
1520     return float16_muladd(op1, op2, dest, 0, stat);
1521 }
1522 
1523 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1524                                  float_status *stat)
1525 {
1526     return float32_muladd(op1, op2, dest, 0, stat);
1527 }
1528 
1529 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1530                                  float_status *stat)
1531 {
1532     return float64_muladd(op1, op2, dest, 0, stat);
1533 }
1534 
1535 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1536                                  float_status *stat)
1537 {
1538     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1539 }
1540 
1541 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1542                                  float_status *stat)
1543 {
1544     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1545 }
1546 
1547 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1548                                  float_status *stat)
1549 {
1550     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1551 }
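
/*
 * Note that the *_mulsub_f variants above negate the multiplicand op1
 * (via float*_chs) rather than the final product: Neon VFMS is defined
 * as Vd = Vd + (-Vn) * Vm with a single rounding, which is exactly
 * float*_muladd(chs(op1), op2, dest).
 */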
1552 
1553 #define DO_MULADD(NAME, FUNC, TYPE)                                        \
1554 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1555                   float_status *stat, uint32_t desc)                       \
1556 {                                                                          \
1557     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1558     TYPE *d = vd, *n = vn, *m = vm;                                        \
1559     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1560         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1561     }                                                                      \
1562     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1563 }
1564 
1565 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1566 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1567 
1568 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1569 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1570 
1571 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1572 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1573 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1574 
1575 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1576 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1577 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1578 
1579 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1580  * For AdvSIMD, there is of course only one such vector segment.
1581  */
1582 
1583 #define DO_MUL_IDX(NAME, TYPE, H) \
1584 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1585 {                                                                          \
1586     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1587     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1588     intptr_t idx = simd_data(desc);                                        \
1589     TYPE *d = vd, *n = vn, *m = vm;                                        \
1590     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1591         TYPE mm = m[H(i + idx)];                                           \
1592         for (j = 0; j < segment; j++) {                                    \
1593             d[i + j] = n[i + j] * mm;                                      \
1594         }                                                                  \
1595     }                                                                      \
1596     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1597 }
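
/*
 * Worked example of the segment logic above: for a 32-byte vector of
 * uint32_t elements with idx == 1, segment is 4, so lanes 0..3 all use
 * m[H(1)] and lanes 4..7 all use m[H(5)], i.e. element 1 of each
 * 128-bit segment of Vm.
 */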
1598 
1599 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1600 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1601 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1602 
1603 #undef DO_MUL_IDX
1604 
1605 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1606 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1607 {                                                                          \
1608     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1609     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1610     intptr_t idx = simd_data(desc);                                        \
1611     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1612     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1613         TYPE mm = m[H(i + idx)];                                           \
1614         for (j = 0; j < segment; j++) {                                    \
1615             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1616         }                                                                  \
1617     }                                                                      \
1618     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1619 }
1620 
1621 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1622 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1623 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1624 
1625 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1626 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1627 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1628 
1629 #undef DO_MLA_IDX
1630 
1631 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1632 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1633                   float_status *stat, uint32_t desc)                       \
1634 {                                                                          \
1635     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1636     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1637     intptr_t idx = simd_data(desc);                                        \
1638     TYPE *d = vd, *n = vn, *m = vm;                                        \
1639     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1640         TYPE mm = m[H(i + idx)];                                           \
1641         for (j = 0; j < segment; j++) {                                    \
1642             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1643         }                                                                  \
1644     }                                                                      \
1645     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1646 }
1647 
1648 #define nop(N, M, S) (M)
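/*
 * With ADD == nop the template reduces to a plain multiply by the
 * indexed element, which implements FMUL and FMULX by element; the
 * expansions further below pass float*_add/float*_sub as ADD to obtain
 * the non-fused multiply-accumulate forms.
 */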
1649 
1650 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1651 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1652 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1653 
1654 #ifdef TARGET_AARCH64
1655 
1656 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1657 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1658 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1659 
1660 #endif
1661 
1662 #undef nop
1663 
1664 /*
1665  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1666  * the fused ops below, these accumulate both from and into Vd.
1667  */
1668 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1669 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1670 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1671 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1672 
1673 #undef DO_FMUL_IDX
1674 
1675 #define DO_FMLA_IDX(NAME, TYPE, H)                                         \
1676 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1677                   float_status *stat, uint32_t desc)                       \
1678 {                                                                          \
1679     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1680     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1681     TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
1682     intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
1683     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1684     op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
1685     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1686         TYPE mm = m[H(i + idx)];                                           \
1687         for (j = 0; j < segment; j++) {                                    \
1688             d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
1689                                      mm, a[i + j], 0, stat);               \
1690         }                                                                  \
1691     }                                                                      \
1692     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1693 }
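
/*
 * In the template above, op1_neg is either 0 or a value with only the
 * sign bit of TYPE set (e.g. 0x80000000 for float32 when bit 0 of the
 * data field is set), so the XOR negates the multiplicand for the
 * indexed FMLS form at no extra cost.
 */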
1694 
1695 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1696 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1697 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1698 
1699 #undef DO_FMLA_IDX
1700 
1701 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1702 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1703 {                                                                          \
1704     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1705     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1706     bool q = false;                                                        \
1707     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1708         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1709         if (dd < MIN) {                                                    \
1710             dd = MIN;                                                      \
1711             q = true;                                                      \
1712         } else if (dd > MAX) {                                             \
1713             dd = MAX;                                                      \
1714             q = true;                                                      \
1715         }                                                                  \
1716         d[i] = dd;                                                         \
1717     }                                                                      \
1718     if (q) {                                                               \
1719         uint32_t *qc = vq;                                                 \
1720         qc[0] = 1;                                                         \
1721     }                                                                      \
1722     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1723 }
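
/*
 * WTYPE is wide enough to hold any un-saturated result, so the compare
 * against MIN/MAX catches overflow exactly.  For example, gvec_uqadd_b
 * computes 200 + 100 as the int value 300, clamps it to UINT8_MAX and
 * sets the QC saturation flag via qc[0] = 1.
 */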
1724 
1725 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1726 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1727 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1728 
1729 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1730 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1731 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1732 
1733 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1734 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1735 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1736 
1737 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1738 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1739 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1740 
1741 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1742 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1743 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1744 
1745 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1746 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1747 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1748 
1749 #undef DO_SAT
1750 
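/*
 * The 64-bit saturating forms have no convenient wider intermediate
 * type, so the helpers below detect wrap-around or signed overflow
 * directly on the 64-bit result instead.
 */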
1751 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1752                           void *vm, uint32_t desc)
1753 {
1754     intptr_t i, oprsz = simd_oprsz(desc);
1755     uint64_t *d = vd, *n = vn, *m = vm;
1756     bool q = false;
1757 
1758     for (i = 0; i < oprsz / 8; i++) {
1759         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1760         if (dd < nn) {
1761             dd = UINT64_MAX;
1762             q = true;
1763         }
1764         d[i] = dd;
1765     }
1766     if (q) {
1767         uint32_t *qc = vq;
1768         qc[0] = 1;
1769     }
1770     clear_tail(d, oprsz, simd_maxsz(desc));
1771 }
1772 
1773 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1774                           void *vm, uint32_t desc)
1775 {
1776     intptr_t i, oprsz = simd_oprsz(desc);
1777     uint64_t *d = vd, *n = vn, *m = vm;
1778     bool q = false;
1779 
1780     for (i = 0; i < oprsz / 8; i++) {
1781         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1782         if (nn < mm) {
1783             dd = 0;
1784             q = true;
1785         }
1786         d[i] = dd;
1787     }
1788     if (q) {
1789         uint32_t *qc = vq;
1790         qc[0] = 1;
1791     }
1792     clear_tail(d, oprsz, simd_maxsz(desc));
1793 }
1794 
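/*
 * For signed saturating addition, overflow occurred iff both operands
 * have the same sign and the result's sign differs, which is what
 * ((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN tests.  The saturated value
 * (nn >> 63) ^ ~INT64_MIN is INT64_MAX for non-negative nn and
 * INT64_MIN for negative nn.  Subtraction instead overflows when the
 * operand signs differ, hence the & in place of & ~ in gvec_sqsub_d.
 */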
1795 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1796                           void *vm, uint32_t desc)
1797 {
1798     intptr_t i, oprsz = simd_oprsz(desc);
1799     int64_t *d = vd, *n = vn, *m = vm;
1800     bool q = false;
1801 
1802     for (i = 0; i < oprsz / 8; i++) {
1803         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1804         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1805             dd = (nn >> 63) ^ ~INT64_MIN;
1806             q = true;
1807         }
1808         d[i] = dd;
1809     }
1810     if (q) {
1811         uint32_t *qc = vq;
1812         qc[0] = 1;
1813     }
1814     clear_tail(d, oprsz, simd_maxsz(desc));
1815 }
1816 
1817 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1818                           void *vm, uint32_t desc)
1819 {
1820     intptr_t i, oprsz = simd_oprsz(desc);
1821     int64_t *d = vd, *n = vn, *m = vm;
1822     bool q = false;
1823 
1824     for (i = 0; i < oprsz / 8; i++) {
1825         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1826         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1827             dd = (nn >> 63) ^ ~INT64_MIN;
1828             q = true;
1829         }
1830         d[i] = dd;
1831     }
1832     if (q) {
1833         uint32_t *qc = vq;
1834         qc[0] = 1;
1835     }
1836     clear_tail(d, oprsz, simd_maxsz(desc));
1837 }
1838 
1839 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1840                            void *vm, uint32_t desc)
1841 {
1842     intptr_t i, oprsz = simd_oprsz(desc);
1843     uint64_t *d = vd, *n = vn, *m = vm;
1844     bool q = false;
1845 
1846     for (i = 0; i < oprsz / 8; i++) {
1847         uint64_t nn = n[i];
1848         int64_t mm = m[i];
1849         uint64_t dd = nn + mm;
1850 
1851         if (mm < 0) {
1852             if (nn < (uint64_t)-mm) {
1853                 dd = 0;
1854                 q = true;
1855             }
1856         } else {
1857             if (dd < nn) {
1858                 dd = UINT64_MAX;
1859                 q = true;
1860             }
1861         }
1862         d[i] = dd;
1863     }
1864     if (q) {
1865         uint32_t *qc = vq;
1866         qc[0] = 1;
1867     }
1868     clear_tail(d, oprsz, simd_maxsz(desc));
1869 }
1870 
1871 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1872                            void *vm, uint32_t desc)
1873 {
1874     intptr_t i, oprsz = simd_oprsz(desc);
1875     uint64_t *d = vd, *n = vn, *m = vm;
1876     bool q = false;
1877 
1878     for (i = 0; i < oprsz / 8; i++) {
1879         int64_t nn = n[i];
1880         uint64_t mm = m[i];
1881         int64_t dd = nn + mm;
1882 
1883         if (mm > (uint64_t)(INT64_MAX - nn)) {
1884             dd = INT64_MAX;
1885             q = true;
1886         }
1887         d[i] = dd;
1888     }
1889     if (q) {
1890         uint32_t *qc = vq;
1891         qc[0] = 1;
1892     }
1893     clear_tail(d, oprsz, simd_maxsz(desc));
1894 }
1895 
1896 #define DO_SRA(NAME, TYPE)                              \
1897 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1898 {                                                       \
1899     intptr_t i, oprsz = simd_oprsz(desc);               \
1900     int shift = simd_data(desc);                        \
1901     TYPE *d = vd, *n = vn;                              \
1902     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1903         d[i] += n[i] >> shift;                          \
1904     }                                                   \
1905     clear_tail(d, oprsz, simd_maxsz(desc));             \
1906 }
1907 
1908 DO_SRA(gvec_ssra_b, int8_t)
1909 DO_SRA(gvec_ssra_h, int16_t)
1910 DO_SRA(gvec_ssra_s, int32_t)
1911 DO_SRA(gvec_ssra_d, int64_t)
1912 
1913 DO_SRA(gvec_usra_b, uint8_t)
1914 DO_SRA(gvec_usra_h, uint16_t)
1915 DO_SRA(gvec_usra_s, uint32_t)
1916 DO_SRA(gvec_usra_d, uint64_t)
1917 
1918 #undef DO_SRA
1919 
1920 #define DO_RSHR(NAME, TYPE)                             \
1921 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1922 {                                                       \
1923     intptr_t i, oprsz = simd_oprsz(desc);               \
1924     int shift = simd_data(desc);                        \
1925     TYPE *d = vd, *n = vn;                              \
1926     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1927         TYPE tmp = n[i] >> (shift - 1);                 \
1928         d[i] = (tmp >> 1) + (tmp & 1);                  \
1929     }                                                   \
1930     clear_tail(d, oprsz, simd_maxsz(desc));             \
1931 }
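
/*
 * The two-step shift above implements a rounding right shift without
 * the overflow risk of adding a rounding constant in the element type:
 * tmp = n >> (shift - 1) keeps the last bit to be discarded in bit 0,
 * and (tmp >> 1) + (tmp & 1) equals (n + (1 << (shift - 1))) >> shift.
 * For example, shift == 2 and n == 7 gives tmp == 3 and a result of 2,
 * i.e. 7/4 rounded to nearest.
 */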
1932 
1933 DO_RSHR(gvec_srshr_b, int8_t)
1934 DO_RSHR(gvec_srshr_h, int16_t)
1935 DO_RSHR(gvec_srshr_s, int32_t)
1936 DO_RSHR(gvec_srshr_d, int64_t)
1937 
1938 DO_RSHR(gvec_urshr_b, uint8_t)
1939 DO_RSHR(gvec_urshr_h, uint16_t)
1940 DO_RSHR(gvec_urshr_s, uint32_t)
1941 DO_RSHR(gvec_urshr_d, uint64_t)
1942 
1943 #undef DO_RSHR
1944 
1945 #define DO_RSRA(NAME, TYPE)                             \
1946 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1947 {                                                       \
1948     intptr_t i, oprsz = simd_oprsz(desc);               \
1949     int shift = simd_data(desc);                        \
1950     TYPE *d = vd, *n = vn;                              \
1951     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1952         TYPE tmp = n[i] >> (shift - 1);                 \
1953         d[i] += (tmp >> 1) + (tmp & 1);                 \
1954     }                                                   \
1955     clear_tail(d, oprsz, simd_maxsz(desc));             \
1956 }
1957 
1958 DO_RSRA(gvec_srsra_b, int8_t)
1959 DO_RSRA(gvec_srsra_h, int16_t)
1960 DO_RSRA(gvec_srsra_s, int32_t)
1961 DO_RSRA(gvec_srsra_d, int64_t)
1962 
1963 DO_RSRA(gvec_ursra_b, uint8_t)
1964 DO_RSRA(gvec_ursra_h, uint16_t)
1965 DO_RSRA(gvec_ursra_s, uint32_t)
1966 DO_RSRA(gvec_ursra_d, uint64_t)
1967 
1968 #undef DO_RSRA
1969 
1970 #define DO_SRI(NAME, TYPE)                              \
1971 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1972 {                                                       \
1973     intptr_t i, oprsz = simd_oprsz(desc);               \
1974     int shift = simd_data(desc);                        \
1975     TYPE *d = vd, *n = vn;                              \
1976     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1977         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1978     }                                                   \
1979     clear_tail(d, oprsz, simd_maxsz(desc));             \
1980 }
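
/*
 * SRI keeps the top 'shift' bits of each destination element and
 * inserts the shifted source into the low bits: for a byte element with
 * shift == 3, the deposit64 above computes (d & 0xe0) | (n >> 3).
 */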
1981 
1982 DO_SRI(gvec_sri_b, uint8_t)
1983 DO_SRI(gvec_sri_h, uint16_t)
1984 DO_SRI(gvec_sri_s, uint32_t)
1985 DO_SRI(gvec_sri_d, uint64_t)
1986 
1987 #undef DO_SRI
1988 
1989 #define DO_SLI(NAME, TYPE)                              \
1990 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1991 {                                                       \
1992     intptr_t i, oprsz = simd_oprsz(desc);               \
1993     int shift = simd_data(desc);                        \
1994     TYPE *d = vd, *n = vn;                              \
1995     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1996         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1997     }                                                   \
1998     clear_tail(d, oprsz, simd_maxsz(desc));             \
1999 }
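
/*
 * SLI is the mirror image: the low 'shift' bits of the destination are
 * preserved and the source is shifted left into the remaining bits, so
 * a byte element with shift == 3 becomes (d & 0x07) | (n << 3),
 * truncated to 8 bits.
 */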
2000 
2001 DO_SLI(gvec_sli_b, uint8_t)
2002 DO_SLI(gvec_sli_h, uint16_t)
2003 DO_SLI(gvec_sli_s, uint32_t)
2004 DO_SLI(gvec_sli_d, uint64_t)
2005 
2006 #undef DO_SLI
2007 
2008 /*
2009  * Convert float16 to float32, raising no exceptions and
2010  * preserving exceptional values, including SNaN.
2011  * This is effectively an unpack+repack operation.
2012  */
2013 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
2014 {
2015     const int f16_bias = 15;
2016     const int f32_bias = 127;
2017     uint32_t sign = extract32(f16, 15, 1);
2018     uint32_t exp = extract32(f16, 10, 5);
2019     uint32_t frac = extract32(f16, 0, 10);
2020 
2021     if (exp == 0x1f) {
2022         /* Inf or NaN */
2023         exp = 0xff;
2024     } else if (exp == 0) {
2025         /* Zero or denormal.  */
2026         if (frac != 0) {
2027             if (fz16) {
2028                 frac = 0;
2029             } else {
2030                 /*
2031                  * Denormal; these are all normal float32.
2032                  * Shift the fraction so that the msb is at bit 11,
2033                  * then remove bit 11 as the implicit bit of the
2034                  * normalized float32.  Note that we still go through
2035                  * the shift for normal numbers below, to put the
2036                  * float32 fraction at the right place.
2037                  */
2038                 int shift = clz32(frac) - 21;
2039                 frac = (frac << shift) & 0x3ff;
2040                 exp = f32_bias - f16_bias - shift + 1;
2041             }
2042         }
2043     } else {
2044         /* Normal number; adjust the bias.  */
2045         exp += f32_bias - f16_bias;
2046     }
2047     sign <<= 31;
2048     exp <<= 23;
2049     frac <<= 23 - 10;
2050 
2051     return sign | exp | frac;
2052 }
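
/*
 * Worked example of the denormal path above: the smallest f16 denormal,
 * 0x0001, has frac == 1, so shift == clz32(1) - 21 == 10, the fraction
 * becomes 0 and exp == 127 - 15 - 10 + 1 == 103, which is the float32
 * encoding of 2^-24 -- exactly the value of that f16 input.
 */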
2053 
2054 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2055 {
2056     /*
2057      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2058      * Load the 2nd qword iff is_q & is_2.
2059      * Shift to the 2nd dword iff !is_q & is_2.
2060      * For !is_q & !is_2, the upper bits of the result are garbage.
2061      */
2062     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2063 }
2064 
2065 /*
2066  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2067  * as there are not yet SVE versions that might use blocking.
2068  */
2069 
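/*
 * In do_fmlal and do_fmlal_idx below, bit 0 of the data field selects
 * FMLSL (all f16 multiplicands from Vn have their sign bits flipped in
 * a single 64-bit XOR) and bit 1 selects the upper half of the f16
 * inputs.
 */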
2070 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2071                      uint32_t desc, bool fz16)
2072 {
2073     intptr_t i, oprsz = simd_oprsz(desc);
2074     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2075     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2076     int is_q = oprsz == 16;
2077     uint64_t n_4, m_4;
2078 
2079     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2080     n_4 = load4_f16(vn, is_q, is_2);
2081     m_4 = load4_f16(vm, is_q, is_2);
2082 
2083     /* Negate all inputs for FMLSL at once.  */
2084     if (is_s) {
2085         n_4 ^= 0x8000800080008000ull;
2086     }
2087 
2088     for (i = 0; i < oprsz / 4; i++) {
2089         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2090         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2091         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2092     }
2093     clear_tail(d, oprsz, simd_maxsz(desc));
2094 }
2095 
2096 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2097                             CPUARMState *env, uint32_t desc)
2098 {
2099     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2100              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2101 }
2102 
2103 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2104                             CPUARMState *env, uint32_t desc)
2105 {
2106     do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, desc,
2107              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2108 }
2109 
2110 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2111                                CPUARMState *env, uint32_t desc)
2112 {
2113     intptr_t i, oprsz = simd_oprsz(desc);
2114     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2115     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2116     float_status *status = &env->vfp.fp_status_a64;
2117     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2118 
2119     for (i = 0; i < oprsz; i += sizeof(float32)) {
2120         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2121         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2122         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2123         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2124         float32 aa = *(float32 *)(va + H1_4(i));
2125 
2126         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2127     }
2128 }
2129 
2130 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2131                          uint32_t desc, bool fz16)
2132 {
2133     intptr_t i, oprsz = simd_oprsz(desc);
2134     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2135     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2136     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2137     int is_q = oprsz == 16;
2138     uint64_t n_4;
2139     float32 m_1;
2140 
2141     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2142     n_4 = load4_f16(vn, is_q, is_2);
2143 
2144     /* Negate all inputs for FMLSL at once.  */
2145     if (is_s) {
2146         n_4 ^= 0x8000800080008000ull;
2147     }
2148 
2149     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2150 
2151     for (i = 0; i < oprsz / 4; i++) {
2152         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2153         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2154     }
2155     clear_tail(d, oprsz, simd_maxsz(desc));
2156 }
2157 
2158 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2159                                 CPUARMState *env, uint32_t desc)
2160 {
2161     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2162                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2163 }
2164 
2165 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2166                                 CPUARMState *env, uint32_t desc)
2167 {
2168     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, desc,
2169                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2170 }
2171 
2172 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2173                                CPUARMState *env, uint32_t desc)
2174 {
2175     intptr_t i, j, oprsz = simd_oprsz(desc);
2176     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2177     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2178     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2179     float_status *status = &env->vfp.fp_status_a64;
2180     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2181 
2182     for (i = 0; i < oprsz; i += 16) {
2183         float16 mm_16 = *(float16 *)(vm + i + idx);
2184         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2185 
2186         for (j = 0; j < 16; j += sizeof(float32)) {
2187             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2188             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2189             float32 aa = *(float32 *)(va + H1_4(i + j));
2190 
2191             *(float32 *)(vd + H1_4(i + j)) =
2192                 float32_muladd(nn, mm, aa, 0, status);
2193         }
2194     }
2195 }
2196 
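/*
 * SSHL/USHL shift each element of Vn by a per-lane signed count taken
 * from the low byte of the corresponding element of Vm: non-negative
 * counts shift left, negative counts shift right.  Left shifts by the
 * element width or more give 0; right shifts by the width or more give
 * 0 for unsigned elements and the replicated sign bit for signed ones,
 * via the clamp to width - 1 below.
 */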
2197 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2198 {
2199     intptr_t i, opr_sz = simd_oprsz(desc);
2200     int8_t *d = vd, *n = vn, *m = vm;
2201 
2202     for (i = 0; i < opr_sz; ++i) {
2203         int8_t mm = m[i];
2204         int8_t nn = n[i];
2205         int8_t res = 0;
2206         if (mm >= 0) {
2207             if (mm < 8) {
2208                 res = nn << mm;
2209             }
2210         } else {
2211             res = nn >> (mm > -8 ? -mm : 7);
2212         }
2213         d[i] = res;
2214     }
2215     clear_tail(d, opr_sz, simd_maxsz(desc));
2216 }
2217 
2218 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2219 {
2220     intptr_t i, opr_sz = simd_oprsz(desc);
2221     int16_t *d = vd, *n = vn, *m = vm;
2222 
2223     for (i = 0; i < opr_sz / 2; ++i) {
2224         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2225         int16_t nn = n[i];
2226         int16_t res = 0;
2227         if (mm >= 0) {
2228             if (mm < 16) {
2229                 res = nn << mm;
2230             }
2231         } else {
2232             res = nn >> (mm > -16 ? -mm : 15);
2233         }
2234         d[i] = res;
2235     }
2236     clear_tail(d, opr_sz, simd_maxsz(desc));
2237 }
2238 
2239 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2240 {
2241     intptr_t i, opr_sz = simd_oprsz(desc);
2242     uint8_t *d = vd, *n = vn, *m = vm;
2243 
2244     for (i = 0; i < opr_sz; ++i) {
2245         int8_t mm = m[i];
2246         uint8_t nn = n[i];
2247         uint8_t res = 0;
2248         if (mm >= 0) {
2249             if (mm < 8) {
2250                 res = nn << mm;
2251             }
2252         } else {
2253             if (mm > -8) {
2254                 res = nn >> -mm;
2255             }
2256         }
2257         d[i] = res;
2258     }
2259     clear_tail(d, opr_sz, simd_maxsz(desc));
2260 }
2261 
2262 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2263 {
2264     intptr_t i, opr_sz = simd_oprsz(desc);
2265     uint16_t *d = vd, *n = vn, *m = vm;
2266 
2267     for (i = 0; i < opr_sz / 2; ++i) {
2268         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2269         uint16_t nn = n[i];
2270         uint16_t res = 0;
2271         if (mm >= 0) {
2272             if (mm < 16) {
2273                 res = nn << mm;
2274             }
2275         } else {
2276             if (mm > -16) {
2277                 res = nn >> -mm;
2278             }
2279         }
2280         d[i] = res;
2281     }
2282     clear_tail(d, opr_sz, simd_maxsz(desc));
2283 }
2284 
2285 /*
2286  * 8x8->8 polynomial multiply.
2287  *
2288  * Polynomial multiplication is like integer multiplication except the
2289  * partial products are XORed, not added.
2290  *
2291  * TODO: expose this as a generic vector operation, as it is a common
2292  * crypto building block.
2293  */
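/*
 * For example, over GF(2) the product 0b11 * 0b11 is 0b101, i.e.
 * (x + 1)^2 = x^2 + 1: the partial products 0b11 and 0b110 are XORed
 * rather than added, so there is no carry into bit 3 as there would be
 * for the integer product 9.
 */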
2294 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2295 {
2296     intptr_t i, opr_sz = simd_oprsz(desc);
2297     uint64_t *d = vd, *n = vn, *m = vm;
2298 
2299     for (i = 0; i < opr_sz / 8; ++i) {
2300         d[i] = clmul_8x8_low(n[i], m[i]);
2301     }
2302     clear_tail(d, opr_sz, simd_maxsz(desc));
2303 }
2304 
2305 /*
2306  * 64x64->128 polynomial multiply.
2307  * Because the lanes are not accessed in strict columns,
2308  * this probably cannot be turned into a generic helper.
2309  */
2310 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2311 {
2312     intptr_t i, opr_sz = simd_oprsz(desc);
2313     intptr_t hi = simd_data(desc);
2314     uint64_t *d = vd, *n = vn, *m = vm;
2315 
2316     for (i = 0; i < opr_sz / 8; i += 2) {
2317         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2318         d[i] = int128_getlo(r);
2319         d[i + 1] = int128_gethi(r);
2320     }
2321     clear_tail(d, opr_sz, simd_maxsz(desc));
2322 }
2323 
2324 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2325 {
2326     int hi = simd_data(desc);
2327     uint64_t *d = vd, *n = vn, *m = vm;
2328     uint64_t nn = n[hi], mm = m[hi];
2329 
2330     d[0] = clmul_8x4_packed(nn, mm);
2331     nn >>= 32;
2332     mm >>= 32;
2333     d[1] = clmul_8x4_packed(nn, mm);
2334 
2335     clear_tail(d, 16, simd_maxsz(desc));
2336 }
2337 
2338 #ifdef TARGET_AARCH64
2339 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2340 {
2341     int shift = simd_data(desc) * 8;
2342     intptr_t i, opr_sz = simd_oprsz(desc);
2343     uint64_t *d = vd, *n = vn, *m = vm;
2344 
2345     for (i = 0; i < opr_sz / 8; ++i) {
2346         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2347     }
2348 }
2349 
2350 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2351 {
2352     intptr_t sel = H4(simd_data(desc));
2353     intptr_t i, opr_sz = simd_oprsz(desc);
2354     uint32_t *n = vn, *m = vm;
2355     uint64_t *d = vd;
2356 
2357     for (i = 0; i < opr_sz / 8; ++i) {
2358         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2359     }
2360 }
2361 #endif
2362 
2363 #define DO_CMP0(NAME, TYPE, OP)                         \
2364 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2365 {                                                       \
2366     intptr_t i, opr_sz = simd_oprsz(desc);              \
2367     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2368         TYPE nn = *(TYPE *)(vn + i);                    \
2369         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2370     }                                                   \
2371     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2372 }
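
/*
 * The negation above turns the 0/1 result of the comparison into the
 * all-zeros/all-ones mask that the AdvSIMD compare-against-zero
 * instructions produce: a true lane stores -1, i.e. all bits set.
 */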
2373 
2374 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2375 DO_CMP0(gvec_clt0_b, int8_t, <)
2376 DO_CMP0(gvec_cle0_b, int8_t, <=)
2377 DO_CMP0(gvec_cgt0_b, int8_t, >)
2378 DO_CMP0(gvec_cge0_b, int8_t, >=)
2379 
2380 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2381 DO_CMP0(gvec_clt0_h, int16_t, <)
2382 DO_CMP0(gvec_cle0_h, int16_t, <=)
2383 DO_CMP0(gvec_cgt0_h, int16_t, >)
2384 DO_CMP0(gvec_cge0_h, int16_t, >=)
2385 
2386 #undef DO_CMP0
2387 
2388 #define DO_ABD(NAME, TYPE)                                      \
2389 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2390 {                                                               \
2391     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2392     TYPE *d = vd, *n = vn, *m = vm;                             \
2393                                                                 \
2394     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2395         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2396     }                                                           \
2397     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2398 }
2399 
2400 DO_ABD(gvec_sabd_b, int8_t)
2401 DO_ABD(gvec_sabd_h, int16_t)
2402 DO_ABD(gvec_sabd_s, int32_t)
2403 DO_ABD(gvec_sabd_d, int64_t)
2404 
2405 DO_ABD(gvec_uabd_b, uint8_t)
2406 DO_ABD(gvec_uabd_h, uint16_t)
2407 DO_ABD(gvec_uabd_s, uint32_t)
2408 DO_ABD(gvec_uabd_d, uint64_t)
2409 
2410 #undef DO_ABD
2411 
2412 #define DO_ABA(NAME, TYPE)                                      \
2413 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2414 {                                                               \
2415     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2416     TYPE *d = vd, *n = vn, *m = vm;                             \
2417                                                                 \
2418     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2419         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2420     }                                                           \
2421     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2422 }
2423 
2424 DO_ABA(gvec_saba_b, int8_t)
2425 DO_ABA(gvec_saba_h, int16_t)
2426 DO_ABA(gvec_saba_s, int32_t)
2427 DO_ABA(gvec_saba_d, int64_t)
2428 
2429 DO_ABA(gvec_uaba_b, uint8_t)
2430 DO_ABA(gvec_uaba_h, uint16_t)
2431 DO_ABA(gvec_uaba_s, uint32_t)
2432 DO_ABA(gvec_uaba_d, uint64_t)
2433 
2434 #undef DO_ABA
2435 
2436 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2437 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
2438                   float_status *stat, uint32_t desc)                       \
2439 {                                                                          \
2440     ARMVectorReg scratch;                                                  \
2441     intptr_t oprsz = simd_oprsz(desc);                                     \
2442     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2443     TYPE *d = vd, *n = vn, *m = vm;                                        \
2444     if (unlikely(d == m)) {                                                \
2445         m = memcpy(&scratch, m, oprsz);                                    \
2446     }                                                                      \
2447     for (intptr_t i = 0; i < half; ++i) {                                  \
2448         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2449     }                                                                      \
2450     for (intptr_t i = 0; i < half; ++i) {                                  \
2451         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2452     }                                                                      \
2453     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2454 }
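
/*
 * Pairwise ops operate on the concatenation Vn:Vm, so the first half of
 * Vd is built from adjacent pairs of Vn and the second half from
 * adjacent pairs of Vm.  Only a Vd == Vm overlap needs the scratch
 * copy: the Vn loop never reads an element it has already overwritten,
 * but the Vm loop could.
 */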
2455 
2456 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2457 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2458 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2459 
2460 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2461 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2462 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2463 
2464 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2465 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2466 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2467 
2468 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2469 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2470 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2471 
2472 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2473 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2474 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2475 
2476 #ifdef TARGET_AARCH64
2477 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
2478 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
2479 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, )
2480 
2481 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
2482 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
2483 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, )
2484 #endif
2485 
2486 #undef DO_3OP_PAIR
2487 
2488 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2489 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2490 {                                                               \
2491     ARMVectorReg scratch;                                       \
2492     intptr_t oprsz = simd_oprsz(desc);                          \
2493     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2494     TYPE *d = vd, *n = vn, *m = vm;                             \
2495     if (unlikely(d == m)) {                                     \
2496         m = memcpy(&scratch, m, oprsz);                         \
2497     }                                                           \
2498     for (intptr_t i = 0; i < half; ++i) {                       \
2499         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2500     }                                                           \
2501     for (intptr_t i = 0; i < half; ++i) {                       \
2502         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2503     }                                                           \
2504     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2505 }
2506 
2507 #define ADD(A, B) (A + B)
2508 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2509 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2510 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2511 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2512 #undef  ADD
2513 
2514 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2515 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2516 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2517 
2518 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2519 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2520 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2521 
2522 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2523 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2524 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2525 
2526 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2527 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2528 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2529 
2530 #undef DO_3OP_PAIR
2531 
2532 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2533     void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
2534     {                                                                   \
2535         intptr_t i, oprsz = simd_oprsz(desc);                           \
2536         int shift = simd_data(desc);                                    \
2537         TYPE *d = vd, *n = vn;                                          \
2538         float_status *fpst = stat;                                      \
2539         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2540             d[i] = FUNC(n[i], shift, fpst);                             \
2541         }                                                               \
2542         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2543     }
2544 
2545 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2546 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2547 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2548 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2549 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2550 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2551 
2552 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2553 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2554 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2555 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2556 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2557 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2558 
2559 #undef DO_VCVT_FIXED
2560 
2561 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2562     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2563     {                                                                   \
2564         intptr_t i, oprsz = simd_oprsz(desc);                           \
2565         uint32_t rmode = simd_data(desc);                               \
2566         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2567         TYPE *d = vd, *n = vn;                                          \
2568         set_float_rounding_mode(rmode, fpst);                           \
2569         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2570             d[i] = FUNC(n[i], 0, fpst);                                 \
2571         }                                                               \
2572         set_float_rounding_mode(prev_rmode, fpst);                      \
2573         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2574     }
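
/*
 * The rounding-mode variants reuse the fixed-point conversion helpers
 * with a shift of zero (FUNC's second argument), temporarily installing
 * the rounding mode requested in the data field and restoring the
 * previous one afterwards.
 */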
2575 
2576 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
2577 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
2578 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2579 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2580 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2581 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2582 
2583 #undef DO_VCVT_RMODE
2584 
2585 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2586     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2587     {                                                                   \
2588         intptr_t i, oprsz = simd_oprsz(desc);                           \
2589         uint32_t rmode = simd_data(desc);                               \
2590         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2591         TYPE *d = vd, *n = vn;                                          \
2592         set_float_rounding_mode(rmode, fpst);                           \
2593         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2594             d[i] = FUNC(n[i], fpst);                                    \
2595         }                                                               \
2596         set_float_rounding_mode(prev_rmode, fpst);                      \
2597         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2598     }
2599 
2600 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2601 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2602 
2603 #undef DO_VRINT_RMODE
2604 
2605 #ifdef TARGET_AARCH64
2606 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
2607 {
2608     const uint8_t *indices = vm;
2609     size_t oprsz = simd_oprsz(desc);
2610     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2611     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2612     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2613     union {
2614         uint8_t b[16];
2615         uint64_t d[2];
2616     } result;
2617 
2618     /*
2619      * We must construct the final result in a temp, lest the output
2620      * overlaps the input table.  For TBL, begin with zero; for TBX,
2621      * begin with the original register contents.  Note that we always
2622      * copy 16 bytes here to avoid an extra branch; clearing the high
2623      * bits of the register for oprsz == 8 is handled below.
2624      */
2625     if (is_tbx) {
2626         memcpy(&result, vd, 16);
2627     } else {
2628         memset(&result, 0, 16);
2629     }
2630 
2631     for (size_t i = 0; i < oprsz; ++i) {
2632         uint32_t index = indices[H1(i)];
2633 
2634         if (index < table_len) {
2635             /*
2636              * Convert index (a byte offset into the virtual table
2637              * which is a series of 128-bit vectors concatenated)
2638              * into the correct register element, bearing in mind
2639              * that the table can wrap around from V31 to V0.
2640              */
2641             const uint8_t *table = (const uint8_t *)
2642                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2643             result.b[H1(i)] = table[H1(index % 16)];
2644         }
2645     }
2646 
2647     memcpy(vd, &result, 16);
2648     clear_tail(vd, oprsz, simd_maxsz(desc));
2649 }
2650 #endif
2651 
2652 /*
2653  * NxN -> N highpart multiply
2654  *
2655  * TODO: expose this as a generic vector operation.
2656  */
2657 
2658 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2659 {
2660     intptr_t i, opr_sz = simd_oprsz(desc);
2661     int8_t *d = vd, *n = vn, *m = vm;
2662 
2663     for (i = 0; i < opr_sz; ++i) {
2664         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2665     }
2666     clear_tail(d, opr_sz, simd_maxsz(desc));
2667 }
2668 
2669 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2670 {
2671     intptr_t i, opr_sz = simd_oprsz(desc);
2672     int16_t *d = vd, *n = vn, *m = vm;
2673 
2674     for (i = 0; i < opr_sz / 2; ++i) {
2675         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2676     }
2677     clear_tail(d, opr_sz, simd_maxsz(desc));
2678 }
2679 
2680 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2681 {
2682     intptr_t i, opr_sz = simd_oprsz(desc);
2683     int32_t *d = vd, *n = vn, *m = vm;
2684 
2685     for (i = 0; i < opr_sz / 4; ++i) {
2686         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2687     }
2688     clear_tail(d, opr_sz, simd_maxsz(desc));
2689 }
2690 
2691 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2692 {
2693     intptr_t i, opr_sz = simd_oprsz(desc);
2694     uint64_t *d = vd, *n = vn, *m = vm;
2695     uint64_t discard;
2696 
2697     for (i = 0; i < opr_sz / 8; ++i) {
2698         muls64(&discard, &d[i], n[i], m[i]);
2699     }
2700     clear_tail(d, opr_sz, simd_maxsz(desc));
2701 }
2702 
2703 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2704 {
2705     intptr_t i, opr_sz = simd_oprsz(desc);
2706     uint8_t *d = vd, *n = vn, *m = vm;
2707 
2708     for (i = 0; i < opr_sz; ++i) {
2709         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2710     }
2711     clear_tail(d, opr_sz, simd_maxsz(desc));
2712 }
2713 
2714 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2715 {
2716     intptr_t i, opr_sz = simd_oprsz(desc);
2717     uint16_t *d = vd, *n = vn, *m = vm;
2718 
2719     for (i = 0; i < opr_sz / 2; ++i) {
2720         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2721     }
2722     clear_tail(d, opr_sz, simd_maxsz(desc));
2723 }
2724 
2725 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2726 {
2727     intptr_t i, opr_sz = simd_oprsz(desc);
2728     uint32_t *d = vd, *n = vn, *m = vm;
2729 
2730     for (i = 0; i < opr_sz / 4; ++i) {
2731         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2732     }
2733     clear_tail(d, opr_sz, simd_maxsz(desc));
2734 }
2735 
2736 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2737 {
2738     intptr_t i, opr_sz = simd_oprsz(desc);
2739     uint64_t *d = vd, *n = vn, *m = vm;
2740     uint64_t discard;
2741 
2742     for (i = 0; i < opr_sz / 8; ++i) {
2743         mulu64(&discard, &d[i], n[i], m[i]);
2744     }
2745     clear_tail(d, opr_sz, simd_maxsz(desc));
2746 }
2747 
2748 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2749 {
2750     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2751     int shr = simd_data(desc);
2752     uint64_t *d = vd, *n = vn, *m = vm;
2753 
2754     for (i = 0; i < opr_sz; ++i) {
2755         d[i] = ror64(n[i] ^ m[i], shr);
2756     }
2757     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2758 }
2759 
2760 /*
2761  * Integer matrix-multiply accumulate
2762  */
2763 
2764 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2765 {
2766     int8_t *n = vn, *m = vm;
2767 
2768     for (intptr_t k = 0; k < 8; ++k) {
2769         sum += n[H1(k)] * m[H1(k)];
2770     }
2771     return sum;
2772 }
2773 
2774 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2775 {
2776     uint8_t *n = vn, *m = vm;
2777 
2778     for (intptr_t k = 0; k < 8; ++k) {
2779         sum += n[H1(k)] * m[H1(k)];
2780     }
2781     return sum;
2782 }
2783 
2784 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2785 {
2786     uint8_t *n = vn;
2787     int8_t *m = vm;
2788 
2789     for (intptr_t k = 0; k < 8; ++k) {
2790         sum += n[H1(k)] * m[H1(k)];
2791     }
2792     return sum;
2793 }
2794 
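/*
 * Each 16-byte segment of Vn and Vm is treated as a 2x8 matrix of bytes
 * (two rows of eight), and the corresponding segment of Vd/Va as a 2x2
 * matrix of int32_t.  Result element [i][j] is the accumulator plus the
 * dot product of row i of Vn with row j of Vm, which is what the four
 * inner_loop calls below compute.
 */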
2795 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2796                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2797 {
2798     intptr_t seg, opr_sz = simd_oprsz(desc);
2799 
2800     for (seg = 0; seg < opr_sz; seg += 16) {
2801         uint32_t *d = vd + seg;
2802         uint32_t *a = va + seg;
2803         uint32_t sum0, sum1, sum2, sum3;
2804 
2805         /*
2806          * Process the entire segment at once, writing back the
2807          * results only after we've consumed all of the inputs.
2808          *
2809          * Key to indices by column:
2810          *          i   j                  i             j
2811          */
2812         sum0 = a[H4(0 + 0)];
2813         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2814         sum1 = a[H4(0 + 1)];
2815         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2816         sum2 = a[H4(2 + 0)];
2817         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2818         sum3 = a[H4(2 + 1)];
2819         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2820 
2821         d[H4(0)] = sum0;
2822         d[H4(1)] = sum1;
2823         d[H4(2)] = sum2;
2824         d[H4(3)] = sum3;
2825     }
2826     clear_tail(vd, opr_sz, simd_maxsz(desc));
2827 }
2828 
2829 #define DO_MMLA_B(NAME, INNER) \
2830     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2831     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2832 
2833 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2834 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2835 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2836 
2837 /*
2838  * BFloat16 Dot Product
2839  */
2840 
2841 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2842 {
2843     /*
2844      * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
2845      * For EBF = 0, we ignore the FPCR bits which determine rounding
2846      * mode and denormal-flushing, and we do unfused multiplies and
2847      * additions with intermediate rounding of all products and sums.
2848      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2849      * and we perform a fused two-way sum-of-products without intermediate
2850      * rounding of the products.
2851      * In either case, we don't set fp exception flags.
2852      *
2853      * EBF is AArch64 only, so even if it's set in the FPCR it has
2854      * no effect on AArch32 instructions.
2855      */
2856     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2857 
2858     *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32;
2859     set_default_nan_mode(true, statusp);
2860 
2861     if (ebf) {
2862         /* EBF=1 needs to do a step with round-to-odd semantics */
2863         *oddstatusp = *statusp;
2864         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2865     } else {
2866         set_flush_to_zero(true, statusp);
2867         set_flush_inputs_to_zero(true, statusp);
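        /*
         * float_round_to_odd_inf is round-to-odd in which overflow
         * produces an infinity rather than the largest finite value.
         */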
2868         set_float_rounding_mode(float_round_to_odd_inf, statusp);
2869     }
2870     return ebf;
2871 }
2872 
2873 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2874 {
2875     float32 t1, t2;
2876 
2877     /*
2878      * Extract each BFloat16 from the element pair, and shift
2879      * them such that they become float32.
2880      */
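    /*
     * For example, the bfloat16 value 0x3f80 (1.0) shifted left by 16
     * becomes the float32 bit pattern 0x3f800000, which is also 1.0;
     * the element already in the high half only needs its low bits
     * masked off.
     */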
2881     t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2882     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2883     t1 = float32_add(t1, t2, fpst);
2884     t1 = float32_add(sum, t1, fpst);
2885 
2886     return t1;
2887 }
2888 
2889 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
2890                      float_status *fpst, float_status *fpst_odd)
2891 {
2892     /*
2893      * Compare f16_dotadd() in sme_helper.c, but here we have
2894      * bfloat16 inputs. In particular that means that we do not
2895      * want the FPCR.FZ16 flush semantics, so we use the normal
2896      * float_status for the input handling here.
2897      */
2898     float64 e1r = float32_to_float64(e1 << 16, fpst);
2899     float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
2900     float64 e2r = float32_to_float64(e2 << 16, fpst);
2901     float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
2902     float64 t64;
2903     float32 t32;
2904 
2905     /*
2906      * The ARM pseudocode function FPDot performs both multiplies
2907      * and the add with a single rounding operation.  Emulate this
2908      * by performing the first multiply in round-to-odd, then doing
2909      * the second multiply as fused multiply-add, and rounding to
2910      * float32 all in one step.
2911      */
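    /*
     * float64r32_muladd() rounds its fused result to float32 precision
     * and range while keeping the float64 representation.
     */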
2912     t64 = float64_mul(e1r, e2r, fpst_odd);
2913     t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
2914 
2915     /* This conversion is exact, because we've already rounded. */
2916     t32 = float64_to_float32(t64, fpst);
2917 
2918     /* The final accumulation step is not fused. */
2919     return float32_add(sum, t32, fpst);
2920 }
2921 
2922 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
2923                         CPUARMState *env, uint32_t desc)
2924 {
2925     intptr_t i, opr_sz = simd_oprsz(desc);
2926     float32 *d = vd, *a = va;
2927     uint32_t *n = vn, *m = vm;
2928     float_status fpst, fpst_odd;
2929 
2930     if (is_ebf(env, &fpst, &fpst_odd)) {
2931         for (i = 0; i < opr_sz / 4; ++i) {
2932             d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
2933         }
2934     } else {
2935         for (i = 0; i < opr_sz / 4; ++i) {
2936             d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
2937         }
2938     }
2939     clear_tail(d, opr_sz, simd_maxsz(desc));
2940 }
2941 
2942 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2943                             void *va, CPUARMState *env, uint32_t desc)
2944 {
2945     intptr_t i, j, opr_sz = simd_oprsz(desc);
2946     intptr_t index = simd_data(desc);
2947     intptr_t elements = opr_sz / 4;
2948     intptr_t eltspersegment = MIN(16 / 4, elements);
2949     float32 *d = vd, *a = va;
2950     uint32_t *n = vn, *m = vm;
2951     float_status fpst, fpst_odd;
2952 
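    /*
     * The indexed 32-bit pair of bfloat16 elements is re-read from
     * within each 128-bit segment and shared by every product in that
     * segment.
     */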
2953     if (is_ebf(env, &fpst, &fpst_odd)) {
2954         for (i = 0; i < elements; i += eltspersegment) {
2955             uint32_t m_idx = m[i + H4(index)];
2956 
2957             for (j = i; j < i + eltspersegment; j++) {
2958                 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
2959             }
2960         }
2961     } else {
2962         for (i = 0; i < elements; i += eltspersegment) {
2963             uint32_t m_idx = m[i + H4(index)];
2964 
2965             for (j = i; j < i + eltspersegment; j++) {
2966                 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
2967             }
2968         }
2969     }
2970     clear_tail(d, opr_sz, simd_maxsz(desc));
2971 }
2972 
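/*
 * BFMMLA: within each 128-bit segment, n and m each hold a 2x4 matrix
 * of bfloat16 (two rows of four) and d/a hold a 2x2 matrix of float32.
 * Result element (i, j) accumulates the dot product of row i of n with
 * row j of m as stored; m holds the architectural 4x2 right-hand
 * operand in transposed form.
 */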
2973 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
2974                          CPUARMState *env, uint32_t desc)
2975 {
2976     intptr_t s, opr_sz = simd_oprsz(desc);
2977     float32 *d = vd, *a = va;
2978     uint32_t *n = vn, *m = vm;
2979     float_status fpst, fpst_odd;
2980 
2981     if (is_ebf(env, &fpst, &fpst_odd)) {
2982         for (s = 0; s < opr_sz / 4; s += 4) {
2983             float32 sum00, sum01, sum10, sum11;
2984 
2985             /*
2986              * Process the entire segment at once, writing back the
2987              * results only after we've consumed all of the inputs.
2988              *
2989              * Key to indices by column:
2990              *               i   j               i   k             j   k
2991              */
2992             sum00 = a[s + H4(0 + 0)];
2993             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
2994             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
2995 
2996             sum01 = a[s + H4(0 + 1)];
2997             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
2998             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
2999 
3000             sum10 = a[s + H4(2 + 0)];
3001             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3002             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3003 
3004             sum11 = a[s + H4(2 + 1)];
3005             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3006             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3007 
3008             d[s + H4(0 + 0)] = sum00;
3009             d[s + H4(0 + 1)] = sum01;
3010             d[s + H4(2 + 0)] = sum10;
3011             d[s + H4(2 + 1)] = sum11;
3012         }
3013     } else {
3014         for (s = 0; s < opr_sz / 4; s += 4) {
3015             float32 sum00, sum01, sum10, sum11;
3016 
3017             /*
3018              * Process the entire segment at once, writing back the
3019              * results only after we've consumed all of the inputs.
3020              *
3021              * Key to indices by column:
3022              *               i   j           i   k             j   k
3023              */
3024             sum00 = a[s + H4(0 + 0)];
3025             sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
3026             sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
3027 
3028             sum01 = a[s + H4(0 + 1)];
3029             sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
3030             sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
3031 
3032             sum10 = a[s + H4(2 + 0)];
3033             sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
3034             sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
3035 
3036             sum11 = a[s + H4(2 + 1)];
3037             sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
3038             sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
3039 
3040             d[s + H4(0 + 0)] = sum00;
3041             d[s + H4(0 + 1)] = sum01;
3042             d[s + H4(2 + 0)] = sum10;
3043             d[s + H4(2 + 1)] = sum11;
3044         }
3045     }
3046     clear_tail(d, opr_sz, simd_maxsz(desc));
3047 }
3048 
3049 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
3050                          float_status *stat, uint32_t desc)
3051 {
3052     intptr_t i, opr_sz = simd_oprsz(desc);
3053     intptr_t sel = simd_data(desc);
3054     float32 *d = vd, *a = va;
3055     bfloat16 *n = vn, *m = vm;
3056 
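    /*
     * sel = 0 selects the even (bottom) bfloat16 of each pair, as for
     * BFMLALB; sel = 1 selects the odd (top) element, as for BFMLALT.
     */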
3057     for (i = 0; i < opr_sz / 4; ++i) {
3058         float32 nn = n[H2(i * 2 + sel)] << 16;
3059         float32 mm = m[H2(i * 2 + sel)] << 16;
3060         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
3061     }
3062     clear_tail(d, opr_sz, simd_maxsz(desc));
3063 }
3064 
3065 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
3066                              void *va, float_status *stat, uint32_t desc)
3067 {
3068     intptr_t i, j, opr_sz = simd_oprsz(desc);
3069     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3070     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3071     intptr_t elements = opr_sz / 4;
3072     intptr_t eltspersegment = MIN(16 / 4, elements);
3073     float32 *d = vd, *a = va;
3074     bfloat16 *n = vn, *m = vm;
3075 
3076     for (i = 0; i < elements; i += eltspersegment) {
3077         float32 m_idx = m[H2(2 * i + index)] << 16;
3078 
3079         for (j = i; j < i + eltspersegment; j++) {
3080             float32 n_j = n[H2(2 * j + sel)] << 16;
3081             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
3082         }
3083     }
3084     clear_tail(d, opr_sz, simd_maxsz(desc));
3085 }
3086 
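/*
 * Clamp helpers: each element of the destructive operand 'a' is
 * clamped to the inclusive range [n, m].
 */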
3087 #define DO_CLAMP(NAME, TYPE) \
3088 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
3089 {                                                                       \
3090     intptr_t i, opr_sz = simd_oprsz(desc);                              \
3091     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
3092         TYPE aa = *(TYPE *)(a + i);                                     \
3093         TYPE nn = *(TYPE *)(n + i);                                     \
3094         TYPE mm = *(TYPE *)(m + i);                                     \
3095         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
3096         *(TYPE *)(d + i) = dd;                                          \
3097     }                                                                   \
3098     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
3099 }
3100 
3101 DO_CLAMP(gvec_sclamp_b, int8_t)
3102 DO_CLAMP(gvec_sclamp_h, int16_t)
3103 DO_CLAMP(gvec_sclamp_s, int32_t)
3104 DO_CLAMP(gvec_sclamp_d, int64_t)
3105 
3106 DO_CLAMP(gvec_uclamp_b, uint8_t)
3107 DO_CLAMP(gvec_uclamp_h, uint16_t)
3108 DO_CLAMP(gvec_uclamp_s, uint32_t)
3109 DO_CLAMP(gvec_uclamp_d, uint64_t)
3110 
3111 /* Bit count in each 8-bit word. */
3112 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
3113 {
3114     intptr_t i, opr_sz = simd_oprsz(desc);
3115     uint8_t *d = vd, *n = vn;
3116 
3117     for (i = 0; i < opr_sz; ++i) {
3118         d[i] = ctpop8(n[i]);
3119     }
3120     clear_tail(d, opr_sz, simd_maxsz(desc));
3121 }
3122 
3123 /* Reverse bits in each 8-bit word. */
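/*
 * revbit64() reverses all 64 bits, which also reverses the byte order;
 * applying bswap64() first cancels that byte reversal, leaving the
 * bits of each byte reversed in place.
 */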
3124 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
3125 {
3126     intptr_t i, opr_sz = simd_oprsz(desc);
3127     uint64_t *d = vd, *n = vn;
3128 
3129     for (i = 0; i < opr_sz / 8; ++i) {
3130         d[i] = revbit64(bswap64(n[i]));
3131     }
3132     clear_tail(d, opr_sz, simd_maxsz(desc));
3133 }
3134 
3135 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
3136 {
3137     intptr_t i, opr_sz = simd_oprsz(desc);
3138     uint32_t *d = vd, *n = vn;
3139 
3140     for (i = 0; i < opr_sz / 4; ++i) {
3141         d[i] = helper_recpe_u32(n[i]);
3142     }
3143     clear_tail(d, opr_sz, simd_maxsz(desc));
3144 }
3145 
3146 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
3147 {
3148     intptr_t i, opr_sz = simd_oprsz(desc);
3149     uint32_t *d = vd, *n = vn;
3150 
3151     for (i = 0; i < opr_sz / 4; ++i) {
3152         d[i] = helper_rsqrte_u32(n[i]);
3153     }
3154     clear_tail(d, opr_sz, simd_maxsz(desc));
3155 }
3156