/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};
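
/*
 * Worked example, reading the table above: predicate value 0x05 has
 * bits 0 and 2 set, so bytes 0 and 2 of the expansion are 0xff:
 * expand_pred_b_data[0x05] == 0x0000000000ff00ff.
 */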

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};
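
/*
 * Example: predicate value 0x11 has bits 0 and 4 set, selecting
 * half-words 0 and 2, hence expand_pred_h_data[0x11] == 0x0000ffff0000ffff.
 * Values with any odd-numbered bit set are masked off by 0xaa in the
 * generator above and so never occur as indices here.
 */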

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
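
/*
 * Worked example, following the arithmetic above:
 * do_sqrdmlah_b(-128, -128, 0, false, true) computes
 * ((-128 * -128) + (1 << 6)) >> 7 = 16448 >> 7 = 128, which does not
 * fit in int8_t, so the result saturates to INT8_MAX (127), the
 * 0x80 * 0x80 case familiar from SQRDMULH.
 */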

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}
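
/*
 * Note: 128-bit intermediates are required here because the 64-bit
 * product alone can overflow; e.g. do_sqrdmlah_d(INT64_MIN, INT64_MIN,
 * 0, false, true) forms 2**126 in the Int128 accumulator and saturates
 * to INT64_MAX via do_sat128_d, mirroring the narrower helpers above.
 */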

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8- and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
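
/*
 * As an illustration of the expansion, gvec_sdot_b computes, for each
 * 32-bit lane i:
 *
 *   d[i] = a[i] + n[4*i + 0] * m[4*i + 0] + n[4*i + 1] * m[4*i + 1]
 *               + n[4*i + 2] * m[4*i + 2] + n[4*i + 3] * m[4*i + 3]
 *
 * with the 8-bit elements sign-extended to 32 bits before multiplying.
 */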

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    /*                                                                    \
     * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
     * first iteration might not be a full 16 byte segment. But           \
     * for vector lengths beyond that this must be SVE and we know        \
     * opr_sz is a multiple of 16, so we need not clamp segend            \
     * to opr_sz_n when we advance it at the end of the loop.             \
     */                                                                   \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        segend = i + (16 / sizeof(TYPED));                                \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
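
/*
 * Example for the indexed forms: with opr_sz == 32 (two 16-byte
 * segments) and index == 1, gvec_sdot_idx_b takes m0..m3 from the
 * second 4-byte group of each segment and reuses those four values
 * for all four dot-product lanes of that segment.
 */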

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
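
/*
 * In the layout above, even-indexed elements are the real parts and
 * odd-indexed elements the imaginary parts of complex pairs. Assuming
 * the translator passes 0 in the desc bit for a rotate of 90 and 1
 * for 270, each pair computes d = (nr - mi, ni + mr) for rotate 90
 * and d = (nr + mi, ni - mr) for rotate 270.
 */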

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
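
/*
 * Sketch of the rotations, assuming the usual (flip, neg_imag)
 * encoding from the translator: with both clear (rotate 0) each pair
 * accumulates d = (ar + nr * mr, ai + nr * mi); flip selects the
 * cross terms ni * mi and ni * mr for rotates 90/270, while the neg
 * bits flip the product signs for rotates 180/270.
 */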

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float64)                \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)   \
    DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)
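
/*
 * For example, the REV expansion of clt yields
 *   float32_clt0(op) == float32_cgt(float32_zero, op)
 * i.e. "op < 0" is evaluated as "0 > op", so all five zero-comparisons
 * reuse the forward cgt/cge/ceq helpers defined earlier.
 */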

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}
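
/*
 * Example: these helpers square op1 and use bit 0 of op2 to choose the
 * sign of the result, so squaring an input of 2.0 yields 4.0 when op2
 * bit 0 is clear and -4.0 when it is set; a NaN product keeps its sign
 * bits unchanged.
 */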

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
{
    float16 r = float16_sub(op1, op2, stat);
    return float16_is_any_nan(r) ? r : float16_abs(r);
}

static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
{
    float32 r = float32_sub(op1, op2, stat);
    return float32_is_any_nan(r) ? r : float32_abs(r);
}

static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
{
    float64 r = float64_sub(op1, op2, stat);
    return float64_is_any_nan(r) ? r : float64_abs(r);
}

/*
 * Reciprocal step. These are the AArch32 versions, which use a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}
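
/*
 * Both rsqrts_nf helpers implement the AArch32 VRSQRTS step
 * (3 - op1 * op2) / 2, with the infinity-times-zero special case
 * returning exactly 1.5 instead of propagating the default NaN.
 */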

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
                  float_status *stat, uint32_t desc)                       \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)

DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)

#endif
#undef DO_3OP

1485 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1486 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1487                                  float_status *stat)
1488 {
1489     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1490 }
1491 
1492 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1493                                  float_status *stat)
1494 {
1495     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1496 }
1497 
1498 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1499                                  float_status *stat)
1500 {
1501     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1502 }
1503 
1504 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1505                                  float_status *stat)
1506 {
1507     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1508 }
1509 
1510 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1511 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1512                                 float_status *stat)
1513 {
1514     return float16_muladd(op1, op2, dest, 0, stat);
1515 }
1516 
1517 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1518                                  float_status *stat)
1519 {
1520     return float32_muladd(op1, op2, dest, 0, stat);
1521 }
1522 
1523 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1524                                  float_status *stat)
1525 {
1526     return float64_muladd(op1, op2, dest, 0, stat);
1527 }
1528 
1529 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1530                                  float_status *stat)
1531 {
1532     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1533 }
1534 
1535 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1536                                  float_status *stat)
1537 {
1538     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1539 }
1540 
1541 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1542                                  float_status *stat)
1543 {
1544     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1545 }
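
/*
 * Note that the mulsub variants negate only the product, not the
 * accumulator: they compute dest + (-op1 * op2) as one fused
 * operation, as the Arm pseudocode does for VFMS/FMLS (FPNeg is
 * applied to the first multiplicand before the fused multiply-add).
 */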

#define DO_MULADD(NAME, FUNC, TYPE)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
                  float_status *stat, uint32_t desc)                       \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(d[i], n[i], m[i], stat);                               \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)

/*
 * For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = n[i + j] * mm;                                      \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}
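
/*
 * A worked example of the segmented indexing (ignoring the
 * host-endian H adjustment): with 32-bit elements, oprsz == 32 and
 * idx == 1, each 128-bit segment holds four elements, so the first
 * segment multiplies n[0..3] by m[1] and the second multiplies
 * n[4..7] by m[5].
 */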

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX

#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = a[i + j] OP n[i + j] * mm;                          \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX

#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
                  float_status *stat, uint32_t desc)                       \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

#define nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)

#ifdef TARGET_AARCH64

DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

#endif

#undef nop

/*
 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 * the fused ops below they accumulate both from and into Vd.
 */
DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)

#undef DO_FMUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
                  float_status *stat, uint32_t desc)                       \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
                                     mm, a[i + j], 0, stat);               \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)

#undef DO_FMLA_IDX

#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
    bool q = false;                                                        \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
        WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
        if (dd < MIN) {                                                    \
            dd = MIN;                                                      \
            q = true;                                                      \
        } else if (dd > MAX) {                                             \
            dd = MAX;                                                      \
            q = true;                                                      \
        }                                                                  \
        d[i] = dd;                                                         \
    }                                                                      \
    if (q) {                                                               \
        uint32_t *qc = vq;                                                 \
        qc[0] = 1;                                                         \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}
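
/*
 * The arithmetic is done in WTYPE, which is wide enough to hold any
 * TYPEN OP TYPEM result without wrapping; e.g. for gvec_uqadd_b below,
 * 0xff + 0xff = 0x1fe is representable in int, saturates to UINT8_MAX,
 * and sets the QC flag.
 */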

DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)

DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)

DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)

DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)

#undef DO_SAT

void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (dd < nn) {
            dd = UINT64_MAX;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (nn < mm) {
            dd = 0;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
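
/*
 * The test above is the usual signed-overflow bit trick: addition
 * overflows iff the operands have the same sign and the result has
 * the opposite one, i.e. (dd ^ nn) & ~(nn ^ mm) has its sign bit set.
 * The saturated value (nn >> 63) ^ ~INT64_MIN evaluates to INT64_MIN
 * when nn is negative and INT64_MAX otherwise.  gvec_sqsub_d below
 * uses (nn ^ mm) instead, since subtraction can only overflow when
 * the operand signs differ.
 */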

void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
                           void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i];
        int64_t mm = m[i];
        uint64_t dd = nn + mm;

        if (mm < 0) {
            if (nn < (uint64_t)-mm) {
                dd = 0;
                q = true;
            }
        } else {
            if (dd < nn) {
                dd = UINT64_MAX;
                q = true;
            }
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
                           void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i];
        uint64_t mm = m[i];
        int64_t dd = nn + mm;

        if (mm > (uint64_t)(INT64_MAX - nn)) {
            dd = INT64_MAX;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

#define DO_SRA(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] += n[i] >> shift;                          \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SRA(gvec_ssra_b, int8_t)
DO_SRA(gvec_ssra_h, int16_t)
DO_SRA(gvec_ssra_s, int32_t)
DO_SRA(gvec_ssra_d, int64_t)

DO_SRA(gvec_usra_b, uint8_t)
DO_SRA(gvec_usra_h, uint16_t)
DO_SRA(gvec_usra_s, uint32_t)
DO_SRA(gvec_usra_d, uint64_t)

#undef DO_SRA

#define DO_RSHR(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] = (tmp >> 1) + (tmp & 1);                  \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}
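
/*
 * The rounding here avoids intermediate overflow: shifting by
 * (shift - 1) first leaves the rounding bit as bit 0 of tmp, so for
 * example with uint8_t, n = 7 and shift = 2, tmp = 3 and
 * d = (3 >> 1) + (3 & 1) = 2, i.e. 7/4 rounded to nearest.  The naive
 * (n + (1 << (shift - 1))) >> shift could wrap in TYPE.
 */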

DO_RSHR(gvec_srshr_b, int8_t)
DO_RSHR(gvec_srshr_h, int16_t)
DO_RSHR(gvec_srshr_s, int32_t)
DO_RSHR(gvec_srshr_d, int64_t)

DO_RSHR(gvec_urshr_b, uint8_t)
DO_RSHR(gvec_urshr_h, uint16_t)
DO_RSHR(gvec_urshr_s, uint32_t)
DO_RSHR(gvec_urshr_d, uint64_t)

#undef DO_RSHR

#define DO_RSRA(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] += (tmp >> 1) + (tmp & 1);                 \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSRA(gvec_srsra_b, int8_t)
DO_RSRA(gvec_srsra_h, int16_t)
DO_RSRA(gvec_srsra_s, int32_t)
DO_RSRA(gvec_srsra_d, int64_t)

DO_RSRA(gvec_ursra_b, uint8_t)
DO_RSRA(gvec_ursra_h, uint16_t)
DO_RSRA(gvec_ursra_s, uint32_t)
DO_RSRA(gvec_ursra_d, uint64_t)

#undef DO_RSRA

#define DO_SRI(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}
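
/*
 * SRI keeps the top 'shift' bits of each destination element and
 * inserts the shifted source beneath them; for 8-bit lanes with
 * shift == 3 this is d[i] = (d[i] & 0xe0) | (n[i] >> 3).
 */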

DO_SRI(gvec_sri_b, uint8_t)
DO_SRI(gvec_sri_h, uint16_t)
DO_SRI(gvec_sri_s, uint32_t)
DO_SRI(gvec_sri_d, uint64_t)

#undef DO_SRI

#define DO_SLI(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SLI(gvec_sli_b, uint8_t)
DO_SLI(gvec_sli_h, uint16_t)
DO_SLI(gvec_sli_s, uint32_t)
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI

/*
 * Convert float16 to float32, raising no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal.  */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32.  Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias.  */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}
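
/*
 * Worked example, for reference: f16 1.0 is 0x3c00 (sign 0, exp 15,
 * frac 0); rebiasing gives exp 15 + (127 - 15) = 127, so the result
 * is 0x3f800000, f32 1.0.  An f16 SNaN such as 0x7d00 keeps its
 * payload and stays signalling, becoming 0x7fa00000.
 */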

static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues.  */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once.  */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            CPUARMState *env, uint32_t desc)
{
    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
}

void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                            CPUARMState *env, uint32_t desc)
{
    do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
}

void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
                               CPUARMState *env, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    float_status *status = &env->vfp.fp_status_a64;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);

    for (i = 0; i < oprsz; i += sizeof(float32)) {
        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
        float32 nn = float16_to_float32_by_bits(nn_16, fz16);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
        float32 aa = *(float32 *)(va + H1_4(i));

        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
    }
}

static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
                         uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    uint64_t n_4;
    float32 m_1;

    /* Pre-load all of the f16 data, avoiding overlap issues.  */
    n_4 = load4_f16(vn, is_q, is_2);

    /* Negate all inputs for FMLSL at once.  */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
                                CPUARMState *env, uint32_t desc)
{
    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
}

void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
                                CPUARMState *env, uint32_t desc)
{
    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
}

void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
                               CPUARMState *env, uint32_t desc)
{
    intptr_t i, j, oprsz = simd_oprsz(desc);
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
    float_status *status = &env->vfp.fp_status_a64;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);

    for (i = 0; i < oprsz; i += 16) {
        float16 mm_16 = *(float16 *)(vm + i + idx);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);

        for (j = 0; j < 16; j += sizeof(float32)) {
            float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
            float32 nn = float16_to_float32_by_bits(nn_16, fz16);
            float32 aa = *(float32 *)(va + H1_4(i + j));

            *(float32 *)(vd + H1_4(i + j)) =
                float32_muladd(nn, mm, aa, 0, status);
        }
    }
}

void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        int8_t nn = n[i];
        int8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -8 ? -mm : 7);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        int16_t nn = n[i];
        int16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -16 ? -mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x8_low(n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
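
/*
 * Example: as polynomials over GF(2), 0b0011 * 0b0011 = 0b0101; the
 * partial products 0b0011 and 0b0110 are XORed rather than added
 * (integer multiplication would give 9 == 0b1001).
 */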

/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
 */
void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    intptr_t hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        Int128 r = clmul_64(n[i + hi], m[i + hi]);
        d[i] = int128_getlo(r);
        d[i + 1] = int128_gethi(r);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t nn = n[hi], mm = m[hi];

    d[0] = clmul_8x4_packed(nn, mm);
    nn >>= 32;
    mm >>= 32;
    d[1] = clmul_8x4_packed(nn, mm);

    clear_tail(d, 16, simd_maxsz(desc));
}

#ifdef TARGET_AARCH64
void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int shift = simd_data(desc) * 8;
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
    }
}

void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t sel = H4(simd_data(desc));
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *n = vn, *m = vm;
    uint64_t *d = vd;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
    }
}
#endif

#define DO_CMP0(NAME, TYPE, OP)                         \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
        TYPE nn = *(TYPE *)(vn + i);                    \
        *(TYPE *)(vd + i) = -(nn OP 0);                 \
    }                                                   \
    clear_tail(vd, opr_sz, simd_maxsz(desc));           \
}

DO_CMP0(gvec_ceq0_b, int8_t, ==)
DO_CMP0(gvec_clt0_b, int8_t, <)
DO_CMP0(gvec_cle0_b, int8_t, <=)
DO_CMP0(gvec_cgt0_b, int8_t, >)
DO_CMP0(gvec_cge0_b, int8_t, >=)

DO_CMP0(gvec_ceq0_h, int16_t, ==)
DO_CMP0(gvec_clt0_h, int16_t, <)
DO_CMP0(gvec_cle0_h, int16_t, <=)
DO_CMP0(gvec_cgt0_h, int16_t, >)
DO_CMP0(gvec_cge0_h, int16_t, >=)

#undef DO_CMP0

#define DO_ABD(NAME, TYPE)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

DO_ABD(gvec_sabd_b, int8_t)
DO_ABD(gvec_sabd_h, int16_t)
DO_ABD(gvec_sabd_s, int32_t)
DO_ABD(gvec_sabd_d, int64_t)

DO_ABD(gvec_uabd_b, uint8_t)
DO_ABD(gvec_uabd_h, uint16_t)
DO_ABD(gvec_uabd_s, uint32_t)
DO_ABD(gvec_uabd_d, uint64_t)

#undef DO_ABD

#define DO_ABA(NAME, TYPE)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

DO_ABA(gvec_saba_b, int8_t)
DO_ABA(gvec_saba_h, int16_t)
DO_ABA(gvec_saba_s, int32_t)
DO_ABA(gvec_saba_d, int64_t)

DO_ABA(gvec_uaba_b, uint8_t)
DO_ABA(gvec_uaba_h, uint16_t)
DO_ABA(gvec_uaba_s, uint32_t)
DO_ABA(gvec_uaba_d, uint64_t)

#undef DO_ABA

#define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
                  float_status *stat, uint32_t desc)                       \
{                                                                          \
    ARMVectorReg scratch;                                                  \
    intptr_t oprsz = simd_oprsz(desc);                                     \
    intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    if (unlikely(d == m)) {                                                \
        m = memcpy(&scratch, m, oprsz);                                    \
    }                                                                      \
    for (intptr_t i = 0; i < half; ++i) {                                  \
        d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
    }                                                                      \
    for (intptr_t i = 0; i < half; ++i) {                                  \
        d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}
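
/*
 * A concrete reading of the pairwise layout: the operation acts on
 * the concatenation n:m, so for four-element vectors the result is
 * { FUNC(n[0],n[1]), FUNC(n[2],n[3]), FUNC(m[0],m[1]), FUNC(m[2],m[3]) }.
 * The scratch copy guards the d == m case, where the second loop
 * would otherwise read elements the first loop has already written.
 */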

DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )

DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )

DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )

DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )

DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )

#ifdef TARGET_AARCH64
DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, )

DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, )
#endif

#undef DO_3OP_PAIR

#define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    ARMVectorReg scratch;                                       \
    intptr_t oprsz = simd_oprsz(desc);                          \
    intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
    TYPE *d = vd, *n = vn, *m = vm;                             \
    if (unlikely(d == m)) {                                     \
        m = memcpy(&scratch, m, oprsz);                         \
    }                                                           \
    for (intptr_t i = 0; i < half; ++i) {                       \
        d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
    }                                                           \
    for (intptr_t i = 0; i < half; ++i) {                       \
        d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
    }                                                           \
    clear_tail(d, oprsz, simd_maxsz(desc));                     \
}

#define ADD(A, B) (A + B)
DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
#undef  ADD

DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)

DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)

DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)

DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)

#undef DO_3OP_PAIR

#define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
    void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
    {                                                                   \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        int shift = simd_data(desc);                                    \
        TYPE *d = vd, *n = vn;                                          \
        float_status *fpst = stat;                                      \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], shift, fpst);                             \
        }                                                               \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }
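
/*
 * For these, simd_data holds the number of fixed-point fraction bits;
 * as a sketch, gvec_vcvt_sf with shift == 4 converts the fixed-point
 * value 24 (i.e. 24 / 2^4) to the float32 1.5.
 */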

DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)

DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)

#undef DO_VCVT_FIXED

#define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
    void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
    {                                                                   \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        uint32_t rmode = simd_data(desc);                               \
        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
        TYPE *d = vd, *n = vn;                                          \
        set_float_rounding_mode(rmode, fpst);                           \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], 0, fpst);                                 \
        }                                                               \
        set_float_rounding_mode(prev_rmode, fpst);                      \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)

#undef DO_VCVT_RMODE

#define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
    void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
    {                                                                   \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        uint32_t rmode = simd_data(desc);                               \
        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
        TYPE *d = vd, *n = vn;                                          \
        set_float_rounding_mode(rmode, fpst);                           \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], fpst);                                    \
        }                                                               \
        set_float_rounding_mode(prev_rmode, fpst);                      \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)

#undef DO_VRINT_RMODE

#ifdef TARGET_AARCH64
void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
{
    const uint8_t *indices = vm;
    size_t oprsz = simd_oprsz(desc);
    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
    union {
        uint8_t b[16];
        uint64_t d[2];
    } result;

    /*
     * We must construct the final result in a temp, lest the output
     * overlaps the input table.  For TBL, begin with zero; for TBX,
     * begin with the original register contents.  Note that we always
     * copy 16 bytes here to avoid an extra branch; clearing the high
     * bits of the register for oprsz == 8 is handled below.
     */
    if (is_tbx) {
        memcpy(&result, vd, 16);
    } else {
        memset(&result, 0, 16);
    }

    for (size_t i = 0; i < oprsz; ++i) {
        uint32_t index = indices[H1(i)];

        if (index < table_len) {
            /*
             * Convert index (a byte offset into the virtual table
             * which is a series of 128-bit vectors concatenated)
             * into the correct register element, bearing in mind
             * that the table can wrap around from V31 to V0.
             */
            const uint8_t *table = (const uint8_t *)
                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
            result.b[H1(i)] = table[H1(index % 16)];
        }
    }

    memcpy(vd, &result, 16);
    clear_tail(vd, oprsz, simd_maxsz(desc));
}
#endif

/*
 * NxN -> N highpart multiply
 *
 * TODO: expose this as a generic vector operation.
 */

void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ((int32_t)n[i] * m[i]) >> 8;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = ((int32_t)n[i] * m[i]) >> 16;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = ((int64_t)n[i] * m[i]) >> 32;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t discard;

    for (i = 0; i < opr_sz / 8; ++i) {
        muls64(&discard, &d[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ((uint32_t)n[i] * m[i]) >> 8;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = ((uint32_t)n[i] * m[i]) >> 16;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = ((uint64_t)n[i] * m[i]) >> 32;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t discard;

    for (i = 0; i < opr_sz / 8; ++i) {
        mulu64(&discard, &d[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror64(n[i] ^ m[i], shr);
    }
    clear_tail(d, opr_sz * 8, simd_maxsz(desc));
}
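
/*
 * XAR is exactly rotate-right-after-exclusive-or; e.g. with shr == 8,
 * n[i] == 0xff and m[i] == 0, the result is 0xff00000000000000ull.
 */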
2753 
2754 /*
2755  * Integer matrix-multiply accumulate
2756  */
2757 
2758 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2759 {
2760     int8_t *n = vn, *m = vm;
2761 
2762     for (intptr_t k = 0; k < 8; ++k) {
2763         sum += n[H1(k)] * m[H1(k)];
2764     }
2765     return sum;
2766 }
2767 
2768 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2769 {
2770     uint8_t *n = vn, *m = vm;
2771 
2772     for (intptr_t k = 0; k < 8; ++k) {
2773         sum += n[H1(k)] * m[H1(k)];
2774     }
2775     return sum;
2776 }
2777 
2778 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2779 {
2780     uint8_t *n = vn;
2781     int8_t *m = vm;
2782 
2783     for (intptr_t k = 0; k < 8; ++k) {
2784         sum += n[H1(k)] * m[H1(k)];
2785     }
2786     return sum;
2787 }
2788 
2789 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2790                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2791 {
2792     intptr_t seg, opr_sz = simd_oprsz(desc);
2793 
2794     for (seg = 0; seg < opr_sz; seg += 16) {
2795         uint32_t *d = vd + seg;
2796         uint32_t *a = va + seg;
2797         uint32_t sum0, sum1, sum2, sum3;
2798 
2799         /*
2800          * Process the entire segment at once, writing back the
2801          * results only after we've consumed all of the inputs.
2802          *
2803          * Key to indices by column:
2804          *          i   j                  i             j
2805          */
2806         sum0 = a[H4(0 + 0)];
2807         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2808         sum1 = a[H4(0 + 1)];
2809         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2810         sum2 = a[H4(2 + 0)];
2811         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2812         sum3 = a[H4(2 + 1)];
2813         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2814 
2815         d[H4(0)] = sum0;
2816         d[H4(1)] = sum1;
2817         d[H4(2)] = sum2;
2818         d[H4(3)] = sum3;
2819     }
2820     clear_tail(vd, opr_sz, simd_maxsz(desc));
2821 }
2822 
2823 #define DO_MMLA_B(NAME, INNER) \
2824     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2825     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2826 
2827 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2828 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2829 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)

/*
 * BFloat16 Dot Product
 */

bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
{
    /*
     * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
     * For EBF = 0, we ignore the FPCR bits which determine rounding
     * mode and denormal-flushing, and we do unfused multiplies and
     * additions with intermediate rounding of all products and sums.
     * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
     * and we perform a fused two-way sum-of-products without intermediate
     * rounding of the products.
     * In either case, we don't set fp exception flags.
     *
     * EBF is AArch64 only, so even if it's set in the FPCR it has
     * no effect on AArch32 instructions.
     */
    bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;

    *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32;
    set_default_nan_mode(true, statusp);

    if (ebf) {
        /* EBF=1 needs to do a step with round-to-odd semantics */
        *oddstatusp = *statusp;
        set_float_rounding_mode(float_round_to_odd, oddstatusp);
    } else {
        set_flush_to_zero(true, statusp);
        set_flush_inputs_to_zero(true, statusp);
        set_float_rounding_mode(float_round_to_odd_inf, statusp);
    }
    return ebf;
}
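
/*
 * Callers use is_ebf() to select between the fused and unfused
 * dot-product routines, along the lines of (cf. gvec_bfdot below):
 *
 *     float_status fpst, fpst_odd;
 *     if (is_ebf(env, &fpst, &fpst_odd)) {
 *         sum = bfdotadd_ebf(sum, e1, e2, &fpst, &fpst_odd);
 *     } else {
 *         sum = bfdotadd(sum, e1, e2, &fpst);
 *     }
 */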

float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
{
    float32 t1, t2;

    /*
     * Extract each BFloat16 from the element pair and widen it to
     * float32: a bfloat16 is the high half of a float32, so the low
     * element of each pair is shifted up by 16 bits and the high
     * element merely has its low half masked off.
     */
    t1 = float32_mul(e1 << 16, e2 << 16, fpst);
    t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
    t1 = float32_add(t1, t2, fpst);
    t1 = float32_add(sum, t1, fpst);

    return t1;
}
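
/*
 * Worked example of the widening: the bfloat16 encoding of 1.0 is
 * 0x3f80, and 0x3f80 << 16 == 0x3f800000, which is the float32
 * encoding of 1.0.
 */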

float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
                     float_status *fpst, float_status *fpst_odd)
{
    /*
     * Compare f16_dotadd() in sme_helper.c, but here we have
     * bfloat16 inputs. In particular that means that we do not
     * want the FPCR.FZ16 flush semantics, so we use the normal
     * float_status for the input handling here.
     */
    float64 e1r = float32_to_float64(e1 << 16, fpst);
    float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
    float64 e2r = float32_to_float64(e2 << 16, fpst);
    float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
    float64 t64;
    float32 t32;

    /*
     * The ARM pseudocode function FPDot performs both multiplies
     * and the add with a single rounding operation.  Emulate this
     * by performing the first multiply in round-to-odd, then doing
     * the second multiply as fused multiply-add, and rounding to
     * float32 all in one step.
     */
    t64 = float64_mul(e1r, e2r, fpst_odd);
    t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);

    /* This conversion is exact, because we've already rounded. */
    t32 = float64_to_float32(t64, fpst);

    /* The final accumulation step is not fused. */
    return float32_add(sum, t32, fpst);
}
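
/*
 * Why round-to-odd is the right mode here: it sets the low result bit
 * whenever any discarded bits are non-zero, so a later rounding to a
 * narrower format gives the same answer a single direct rounding would
 * have given. That is what lets the two-step float64 evaluation above
 * match the single-rounded FPDot pseudocode.
 */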

void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
                        CPUARMState *env, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (i = 0; i < opr_sz / 4; ++i) {
            d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
        }
    } else {
        for (i = 0; i < opr_sz / 4; ++i) {
            d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
                            void *va, CPUARMState *env, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t index = simd_data(desc);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (i = 0; i < elements; i += eltspersegment) {
            uint32_t m_idx = m[i + H4(index)];

            for (j = i; j < i + eltspersegment; j++) {
                d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
            }
        }
    } else {
        for (i = 0; i < elements; i += eltspersegment) {
            uint32_t m_idx = m[i + H4(index)];

            for (j = i; j < i + eltspersegment; j++) {
                d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
            }
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
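
/*
 * Illustration of the indexed form: within each 128-bit segment every
 * destination element accumulates against the same selected element of
 * that segment of m, so for the first segment with a hypothetical
 * index of 2 this computes
 *
 *     d[j] = a[j] + n[j] . m[2]        for j = 0..3
 *
 * where "." is the two-way bfloat16 dot product implemented by
 * bfdotadd()/bfdotadd_ebf() above.
 */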

void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
                         CPUARMState *env, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (s = 0; s < opr_sz / 4; s += 4) {
            float32 sum00, sum01, sum10, sum11;

            /*
             * Process the entire segment at once, writing back the
             * results only after we've consumed all of the inputs.
             *
             * Key to indices by column:
             *               i   j               i   k             j   k
             */
            sum00 = a[s + H4(0 + 0)];
            sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
            sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);

            sum01 = a[s + H4(0 + 1)];
            sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
            sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);

            sum10 = a[s + H4(2 + 0)];
            sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
            sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);

            sum11 = a[s + H4(2 + 1)];
            sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
            sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);

            d[s + H4(0 + 0)] = sum00;
            d[s + H4(0 + 1)] = sum01;
            d[s + H4(2 + 0)] = sum10;
            d[s + H4(2 + 1)] = sum11;
        }
    } else {
        for (s = 0; s < opr_sz / 4; s += 4) {
            float32 sum00, sum01, sum10, sum11;

            /*
             * Process the entire segment at once, writing back the
             * results only after we've consumed all of the inputs.
             *
             * Key to indices by column:
             *               i   j           i   k             j   k
             */
            sum00 = a[s + H4(0 + 0)];
            sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
            sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);

            sum01 = a[s + H4(0 + 1)];
            sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
            sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);

            sum10 = a[s + H4(2 + 0)];
            sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
            sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);

            sum11 = a[s + H4(2 + 1)];
            sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
            sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);

            d[s + H4(0 + 0)] = sum00;
            d[s + H4(0 + 1)] = sum01;
            d[s + H4(2 + 0)] = sum10;
            d[s + H4(2 + 1)] = sum11;
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
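
/*
 * Structurally this is the same 2x2 tiling as do_mmla_b() above, with
 * bfdotadd()/bfdotadd_ebf() as the inner product; in scalar pseudo-C,
 * ignoring the H4() host-order adjustments (illustrative only):
 *
 *     for (i = 0; i < 2; i++) {
 *         for (j = 0; j < 2; j++) {
 *             sum = a[2 * i + j];
 *             for (k = 0; k < 2; k++) {
 *                 sum = bfdotadd(sum, n[2 * i + k], m[2 * j + k], &fpst);
 *             }
 *             d[2 * i + j] = sum;
 *         }
 *     }
 *
 * where each n[] / m[] element is itself a pair of bfloat16 values.
 */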

void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
                         float_status *stat, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    intptr_t sel = simd_data(desc);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        float32 nn = n[H2(i * 2 + sel)] << 16;
        float32 mm = m[H2(i * 2 + sel)] << 16;
        d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
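
/*
 * "sel" picks the even (bottom) or odd (top) bfloat16 of each 32-bit
 * pair, both here and in the indexed variant below: e.g. with a
 * hypothetical sel == 1 the loop widens n[1], n[3], n[5], ... to
 * float32 before the fused multiply-add.
 */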

void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
                             void *va, float_status *stat, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 m_idx = m[H2(2 * i + index)] << 16;

        for (j = i; j < i + eltspersegment; j++) {
            float32 n_j = n[H2(2 * j + sel)] << 16;
            d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

#define DO_CLAMP(NAME, TYPE) \
void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
        TYPE aa = *(TYPE *)(a + i);                                     \
        TYPE nn = *(TYPE *)(n + i);                                     \
        TYPE mm = *(TYPE *)(m + i);                                     \
        TYPE dd = MIN(MAX(aa, nn), mm);                                 \
        *(TYPE *)(d + i) = dd;                                          \
    }                                                                   \
    clear_tail(d, opr_sz, simd_maxsz(desc));                            \
}

DO_CLAMP(gvec_sclamp_b, int8_t)
DO_CLAMP(gvec_sclamp_h, int16_t)
DO_CLAMP(gvec_sclamp_s, int32_t)
DO_CLAMP(gvec_sclamp_d, int64_t)

DO_CLAMP(gvec_uclamp_b, uint8_t)
DO_CLAMP(gvec_uclamp_h, uint16_t)
DO_CLAMP(gvec_uclamp_s, uint32_t)
DO_CLAMP(gvec_uclamp_d, uint64_t)
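
/*
 * The operand roles match the SVE2/SME CLAMP instructions: "a" is the
 * value being constrained, "n" the lower bound and "m" the upper bound.
 * E.g. for gvec_sclamp_b with hypothetical inputs aa = 100, nn = -5,
 * mm = 10, the result is MIN(MAX(100, -5), 10) = 10.
 */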

/* Bit count in each 8-bit word. */
void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ctpop8(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Reverse bits in each 8-bit word. */
void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = revbit64(bswap64(n[i]));
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
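
/*
 * Why bswap64 + revbit64 works: revbit64() reverses all 64 bits, which
 * reverses both the byte order and the bits within each byte; the
 * prior bswap64() cancels the byte reversal, leaving only the per-byte
 * bit reversal. E.g. an input byte of 0x01 comes out as 0x80 in the
 * same byte position.
 */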

void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_recpe_u32(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_rsqrte_u32(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
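
/*
 * Both helpers defer to the Arm pseudocode estimate functions, which
 * treat the input as an unsigned fixed-point fraction: inputs too small
 * for the estimate to be representable (for URECPE, anything below 0.5,
 * i.e. with bit 31 clear) saturate to 0xffffffff.
 */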