1 /*
2  * ARM AdvSIMD / SVE Vector Operations
3  *
4  * Copyright (c) 2018 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "tcg/tcg-gvec-desc.h"
24 #include "fpu/softfloat.h"
25 #include "qemu/int128.h"
26 #include "crypto/clmul.h"
27 #include "vec_internal.h"
28 
29 /*
30  * Data for expanding active predicate bits to bytes, for byte elements.
31  *
32  *  for (i = 0; i < 256; ++i) {
33  *      unsigned long m = 0;
34  *      for (j = 0; j < 8; j++) {
35  *          if ((i >> j) & 1) {
36  *              m |= 0xfful << (j << 3);
37  *          }
38  *      }
39  *      printf("0x%016lx,\n", m);
40  *  }
41  */
42 const uint64_t expand_pred_b_data[256] = {
43     0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44     0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45     0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46     0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47     0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48     0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49     0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50     0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51     0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52     0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53     0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54     0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55     0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56     0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57     0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58     0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59     0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60     0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61     0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62     0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63     0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64     0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65     0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66     0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67     0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68     0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69     0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70     0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71     0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72     0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73     0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74     0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75     0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76     0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77     0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78     0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79     0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80     0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81     0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82     0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83     0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84     0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85     0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86     0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87     0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88     0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89     0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90     0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91     0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92     0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93     0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94     0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95     0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96     0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97     0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98     0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99     0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100     0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101     0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102     0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103     0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104     0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105     0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106     0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107     0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108     0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109     0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110     0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111     0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112     0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113     0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114     0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115     0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116     0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117     0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118     0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119     0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120     0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121     0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122     0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123     0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124     0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125     0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126     0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127     0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128     0xffffffffffffffff,
129 };
130 
131 /*
132  * Similarly for half-word elements.
133  *  for (i = 0; i < 256; ++i) {
134  *      unsigned long m = 0;
135  *      if (i & 0xaa) {
136  *          continue;
137  *      }
138  *      for (j = 0; j < 8; j += 2) {
139  *          if ((i >> j) & 1) {
140  *              m |= 0xfffful << (j << 3);
141  *          }
142  *      }
143  *      printf("[0x%x] = 0x%016lx,\n", i, m);
144  *  }
145  */
146 const uint64_t expand_pred_h_data[0x55 + 1] = {
147     [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148     [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149     [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150     [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151     [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152     [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153     [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154     [0x55] = 0xffffffffffffffff,
155 };
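/*
 * Indices with any odd bit set are skipped by the generator above, so
 * those entries of the table simply remain zero.
 */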
156 
157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159                      bool neg, bool round)
160 {
161     /*
162      * Simplify:
163      * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164      * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
165      */
166     int32_t ret = (int32_t)src1 * src2;
167     if (neg) {
168         ret = -ret;
169     }
170     ret += ((int32_t)src3 << 7) + (round << 6);
171     ret >>= 7;
172 
173     if (ret != (int8_t)ret) {
174         ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175     }
176     return ret;
177 }
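/*
 * Worked example of the saturating path: with src1 = src2 = INT8_MIN,
 * src3 = 0 and round set,
 *     ret = (-128 * -128) + (0 << 7) + (1 << 6) = 16448
 *     ret >>= 7                                 -> 128
 * 128 does not fit in int8_t, so the result is clamped to INT8_MAX.
 */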
178 
179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180                              void *va, uint32_t desc)
181 {
182     intptr_t i, opr_sz = simd_oprsz(desc);
183     int8_t *d = vd, *n = vn, *m = vm, *a = va;
184 
185     for (i = 0; i < opr_sz; ++i) {
186         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187     }
188 }
189 
190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191                              void *va, uint32_t desc)
192 {
193     intptr_t i, opr_sz = simd_oprsz(desc);
194     int8_t *d = vd, *n = vn, *m = vm, *a = va;
195 
196     for (i = 0; i < opr_sz; ++i) {
197         d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198     }
199 }
200 
201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202 {
203     intptr_t i, opr_sz = simd_oprsz(desc);
204     int8_t *d = vd, *n = vn, *m = vm;
205 
206     for (i = 0; i < opr_sz; ++i) {
207         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208     }
209 }
210 
211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212 {
213     intptr_t i, opr_sz = simd_oprsz(desc);
214     int8_t *d = vd, *n = vn, *m = vm;
215 
216     for (i = 0; i < opr_sz; ++i) {
217         d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218     }
219 }
220 
221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223                       bool neg, bool round, uint32_t *sat)
224 {
225     /* Simplify similarly to do_sqrdmlah_b above.  */
226     int32_t ret = (int32_t)src1 * src2;
227     if (neg) {
228         ret = -ret;
229     }
230     ret += ((int32_t)src3 << 15) + (round << 14);
231     ret >>= 15;
232 
233     if (ret != (int16_t)ret) {
234         *sat = 1;
235         ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236     }
237     return ret;
238 }
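/*
 * The wider forms also report saturation through *sat: the Neon
 * helpers below point it at the cumulative QC flag (env->vfp.qc),
 * while the SVE2 helpers pass a dummy "discard" variable, since SVE2
 * does not set QC.
 */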
239 
240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241                                   uint32_t src2, uint32_t src3)
242 {
243     uint32_t *sat = &env->vfp.qc[0];
244     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246                                 false, true, sat);
247     return deposit32(e1, 16, 16, e2);
248 }
249 
250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251                               void *vq, uint32_t desc)
252 {
253     uintptr_t opr_sz = simd_oprsz(desc);
254     int16_t *d = vd;
255     int16_t *n = vn;
256     int16_t *m = vm;
257     uintptr_t i;
258 
259     for (i = 0; i < opr_sz / 2; ++i) {
260         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261     }
262     clear_tail(d, opr_sz, simd_maxsz(desc));
263 }
264 
265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266                                   uint32_t src2, uint32_t src3)
267 {
268     uint32_t *sat = &env->vfp.qc[0];
269     uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270     uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271                                 true, true, sat);
272     return deposit32(e1, 16, 16, e2);
273 }
274 
275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276                               void *vq, uint32_t desc)
277 {
278     uintptr_t opr_sz = simd_oprsz(desc);
279     int16_t *d = vd;
280     int16_t *n = vn;
281     int16_t *m = vm;
282     uintptr_t i;
283 
284     for (i = 0; i < opr_sz / 2; ++i) {
285         d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286     }
287     clear_tail(d, opr_sz, simd_maxsz(desc));
288 }
289 
290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291                             void *vq, uint32_t desc)
292 {
293     intptr_t i, opr_sz = simd_oprsz(desc);
294     int16_t *d = vd, *n = vn, *m = vm;
295 
296     for (i = 0; i < opr_sz / 2; ++i) {
297         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298     }
299     clear_tail(d, opr_sz, simd_maxsz(desc));
300 }
301 
302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303                              void *vq, uint32_t desc)
304 {
305     intptr_t i, opr_sz = simd_oprsz(desc);
306     int16_t *d = vd, *n = vn, *m = vm;
307 
308     for (i = 0; i < opr_sz / 2; ++i) {
309         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310     }
311     clear_tail(d, opr_sz, simd_maxsz(desc));
312 }
313 
314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
315                                 void *vq, uint32_t desc)
316 {
317     intptr_t i, j, opr_sz = simd_oprsz(desc);
318     int idx = simd_data(desc);
319     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
320     intptr_t elements = opr_sz / 2;
321     intptr_t eltspersegment = MIN(16 / 2, elements);
322 
323     for (i = 0; i < elements; i += 16 / 2) {
324         int16_t mm = m[i];
325         for (j = 0; j < eltspersegment; ++j) {
326             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
327         }
328     }
329     clear_tail(d, opr_sz, simd_maxsz(desc));
330 }
331 
332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
333                                  void *vq, uint32_t desc)
334 {
335     intptr_t i, j, opr_sz = simd_oprsz(desc);
336     int idx = simd_data(desc);
337     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
338     intptr_t elements = opr_sz / 2;
339     intptr_t eltspersegment = MIN(16 / 2, elements);
340 
341     for (i = 0; i < elements; i += 16 / 2) {
342         int16_t mm = m[i];
343         for (j = 0; j < eltspersegment; ++j) {
344             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
345         }
346     }
347     clear_tail(d, opr_sz, simd_maxsz(desc));
348 }
349 
350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
351                                  void *vq, uint32_t desc)
352 {
353     intptr_t i, j, opr_sz = simd_oprsz(desc);
354     int idx = simd_data(desc);
355     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
356     intptr_t elements = opr_sz / 2;
357     intptr_t eltspersegment = MIN(16 / 2, elements);
358 
359     for (i = 0; i < elements; i += 16 / 2) {
360         int16_t mm = m[i];
361         for (j = 0; j < eltspersegment; ++j) {
362             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
363         }
364     }
365     clear_tail(d, opr_sz, simd_maxsz(desc));
366 }
367 
368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
369                                  void *vq, uint32_t desc)
370 {
371     intptr_t i, j, opr_sz = simd_oprsz(desc);
372     int idx = simd_data(desc);
373     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
374     intptr_t elements = opr_sz / 2;
375     intptr_t eltspersegment = MIN(16 / 2, elements);
376 
377     for (i = 0; i < elements; i += 16 / 2) {
378         int16_t mm = m[i];
379         for (j = 0; j < eltspersegment; ++j) {
380             d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
381         }
382     }
383     clear_tail(d, opr_sz, simd_maxsz(desc));
384 }
385 
386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
387                              void *va, uint32_t desc)
388 {
389     intptr_t i, opr_sz = simd_oprsz(desc);
390     int16_t *d = vd, *n = vn, *m = vm, *a = va;
391     uint32_t discard;
392 
393     for (i = 0; i < opr_sz / 2; ++i) {
394         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
395     }
396 }
397 
398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
399                              void *va, uint32_t desc)
400 {
401     intptr_t i, opr_sz = simd_oprsz(desc);
402     int16_t *d = vd, *n = vn, *m = vm, *a = va;
403     uint32_t discard;
404 
405     for (i = 0; i < opr_sz / 2; ++i) {
406         d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
407     }
408 }
409 
410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
411 {
412     intptr_t i, opr_sz = simd_oprsz(desc);
413     int16_t *d = vd, *n = vn, *m = vm;
414     uint32_t discard;
415 
416     for (i = 0; i < opr_sz / 2; ++i) {
417         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
418     }
419 }
420 
421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
422 {
423     intptr_t i, opr_sz = simd_oprsz(desc);
424     int16_t *d = vd, *n = vn, *m = vm;
425     uint32_t discard;
426 
427     for (i = 0; i < opr_sz / 2; ++i) {
428         d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
429     }
430 }
431 
432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
433 {
434     intptr_t i, j, opr_sz = simd_oprsz(desc);
435     int idx = simd_data(desc);
436     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
437     uint32_t discard;
438 
439     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
440         int16_t mm = m[i];
441         for (j = 0; j < 16 / 2; ++j) {
442             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
443         }
444     }
445 }
446 
447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
448 {
449     intptr_t i, j, opr_sz = simd_oprsz(desc);
450     int idx = simd_data(desc);
451     int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
452     uint32_t discard;
453 
454     for (i = 0; i < opr_sz / 2; i += 16 / 2) {
455         int16_t mm = m[i];
456         for (j = 0; j < 16 / 2; ++j) {
457             d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
458         }
459     }
460 }
461 
462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
464                       bool neg, bool round, uint32_t *sat)
465 {
466     /* Simplify similarly to do_sqrdmlah_b above.  */
467     int64_t ret = (int64_t)src1 * src2;
468     if (neg) {
469         ret = -ret;
470     }
471     ret += ((int64_t)src3 << 31) + (round << 30);
472     ret >>= 31;
473 
474     if (ret != (int32_t)ret) {
475         *sat = 1;
476         ret = (ret < 0 ? INT32_MIN : INT32_MAX);
477     }
478     return ret;
479 }
480 
481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
482                                   int32_t src2, int32_t src3)
483 {
484     uint32_t *sat = &env->vfp.qc[0];
485     return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
486 }
487 
488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
489                               void *vq, uint32_t desc)
490 {
491     uintptr_t opr_sz = simd_oprsz(desc);
492     int32_t *d = vd;
493     int32_t *n = vn;
494     int32_t *m = vm;
495     uintptr_t i;
496 
497     for (i = 0; i < opr_sz / 4; ++i) {
498         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
499     }
500     clear_tail(d, opr_sz, simd_maxsz(desc));
501 }
502 
503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
504                                   int32_t src2, int32_t src3)
505 {
506     uint32_t *sat = &env->vfp.qc[0];
507     return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
508 }
509 
510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
511                               void *vq, uint32_t desc)
512 {
513     uintptr_t opr_sz = simd_oprsz(desc);
514     int32_t *d = vd;
515     int32_t *n = vn;
516     int32_t *m = vm;
517     uintptr_t i;
518 
519     for (i = 0; i < opr_sz / 4; ++i) {
520         d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
521     }
522     clear_tail(d, opr_sz, simd_maxsz(desc));
523 }
524 
525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
526                             void *vq, uint32_t desc)
527 {
528     intptr_t i, opr_sz = simd_oprsz(desc);
529     int32_t *d = vd, *n = vn, *m = vm;
530 
531     for (i = 0; i < opr_sz / 4; ++i) {
532         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
533     }
534     clear_tail(d, opr_sz, simd_maxsz(desc));
535 }
536 
537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
538                              void *vq, uint32_t desc)
539 {
540     intptr_t i, opr_sz = simd_oprsz(desc);
541     int32_t *d = vd, *n = vn, *m = vm;
542 
543     for (i = 0; i < opr_sz / 4; ++i) {
544         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
545     }
546     clear_tail(d, opr_sz, simd_maxsz(desc));
547 }
548 
549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
550                                 void *vq, uint32_t desc)
551 {
552     intptr_t i, j, opr_sz = simd_oprsz(desc);
553     int idx = simd_data(desc);
554     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
555     intptr_t elements = opr_sz / 4;
556     intptr_t eltspersegment = MIN(16 / 4, elements);
557 
558     for (i = 0; i < elements; i += 16 / 4) {
559         int32_t mm = m[i];
560         for (j = 0; j < eltspersegment; ++j) {
561             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
562         }
563     }
564     clear_tail(d, opr_sz, simd_maxsz(desc));
565 }
566 
567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
568                                  void *vq, uint32_t desc)
569 {
570     intptr_t i, j, opr_sz = simd_oprsz(desc);
571     int idx = simd_data(desc);
572     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
573     intptr_t elements = opr_sz / 4;
574     intptr_t eltspersegment = MIN(16 / 4, elements);
575 
576     for (i = 0; i < elements; i += 16 / 4) {
577         int32_t mm = m[i];
578         for (j = 0; j < eltspersegment; ++j) {
579             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
580         }
581     }
582     clear_tail(d, opr_sz, simd_maxsz(desc));
583 }
584 
585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
586                                  void *vq, uint32_t desc)
587 {
588     intptr_t i, j, opr_sz = simd_oprsz(desc);
589     int idx = simd_data(desc);
590     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
591     intptr_t elements = opr_sz / 4;
592     intptr_t eltspersegment = MIN(16 / 4, elements);
593 
594     for (i = 0; i < elements; i += 16 / 4) {
595         int32_t mm = m[i];
596         for (j = 0; j < eltspersegment; ++j) {
597             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
598         }
599     }
600     clear_tail(d, opr_sz, simd_maxsz(desc));
601 }
602 
603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
604                                  void *vq, uint32_t desc)
605 {
606     intptr_t i, j, opr_sz = simd_oprsz(desc);
607     int idx = simd_data(desc);
608     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
609     intptr_t elements = opr_sz / 4;
610     intptr_t eltspersegment = MIN(16 / 4, elements);
611 
612     for (i = 0; i < elements; i += 16 / 4) {
613         int32_t mm = m[i];
614         for (j = 0; j < eltspersegment; ++j) {
615             d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
616         }
617     }
618     clear_tail(d, opr_sz, simd_maxsz(desc));
619 }
620 
621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
622                              void *va, uint32_t desc)
623 {
624     intptr_t i, opr_sz = simd_oprsz(desc);
625     int32_t *d = vd, *n = vn, *m = vm, *a = va;
626     uint32_t discard;
627 
628     for (i = 0; i < opr_sz / 4; ++i) {
629         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
630     }
631 }
632 
633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
634                              void *va, uint32_t desc)
635 {
636     intptr_t i, opr_sz = simd_oprsz(desc);
637     int32_t *d = vd, *n = vn, *m = vm, *a = va;
638     uint32_t discard;
639 
640     for (i = 0; i < opr_sz / 4; ++i) {
641         d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
642     }
643 }
644 
645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
646 {
647     intptr_t i, opr_sz = simd_oprsz(desc);
648     int32_t *d = vd, *n = vn, *m = vm;
649     uint32_t discard;
650 
651     for (i = 0; i < opr_sz / 4; ++i) {
652         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
653     }
654 }
655 
656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
657 {
658     intptr_t i, opr_sz = simd_oprsz(desc);
659     int32_t *d = vd, *n = vn, *m = vm;
660     uint32_t discard;
661 
662     for (i = 0; i < opr_sz / 4; ++i) {
663         d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
664     }
665 }
666 
667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
668 {
669     intptr_t i, j, opr_sz = simd_oprsz(desc);
670     int idx = simd_data(desc);
671     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
672     uint32_t discard;
673 
674     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
675         int32_t mm = m[i];
676         for (j = 0; j < 16 / 4; ++j) {
677             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
678         }
679     }
680 }
681 
682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
683 {
684     intptr_t i, j, opr_sz = simd_oprsz(desc);
685     int idx = simd_data(desc);
686     int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
687     uint32_t discard;
688 
689     for (i = 0; i < opr_sz / 4; i += 16 / 4) {
690         int32_t mm = m[i];
691         for (j = 0; j < 16 / 4; ++j) {
692             d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
693         }
694     }
695 }
696 
697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
698 static int64_t do_sat128_d(Int128 r)
699 {
700     int64_t ls = int128_getlo(r);
701     int64_t hs = int128_gethi(r);
702 
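    /*
     * The 128-bit value fits in int64_t only if the high half is the
     * sign extension of the low half; anything else is an overflow.
     */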
703     if (unlikely(hs != (ls >> 63))) {
704         return hs < 0 ? INT64_MIN : INT64_MAX;
705     }
706     return ls;
707 }
708 
709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
710 {
711     uint64_t l, h;
712     Int128 r, t;
713 
714     /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
715     muls64(&l, &h, m, n);
716     r = int128_make128(l, h);
717     if (neg) {
718         r = int128_neg(r);
719     }
720     if (a) {
721         t = int128_exts64(a);
722         t = int128_lshift(t, 63);
723         r = int128_add(r, t);
724     }
725     if (round) {
726         t = int128_exts64(1ll << 62);
727         r = int128_add(r, t);
728     }
729     r = int128_rshift(r, 63);
730 
731     return do_sat128_d(r);
732 }
733 
734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
735                              void *va, uint32_t desc)
736 {
737     intptr_t i, opr_sz = simd_oprsz(desc);
738     int64_t *d = vd, *n = vn, *m = vm, *a = va;
739 
740     for (i = 0; i < opr_sz / 8; ++i) {
741         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
742     }
743 }
744 
745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
746                              void *va, uint32_t desc)
747 {
748     intptr_t i, opr_sz = simd_oprsz(desc);
749     int64_t *d = vd, *n = vn, *m = vm, *a = va;
750 
751     for (i = 0; i < opr_sz / 8; ++i) {
752         d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
753     }
754 }
755 
756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
757 {
758     intptr_t i, opr_sz = simd_oprsz(desc);
759     int64_t *d = vd, *n = vn, *m = vm;
760 
761     for (i = 0; i < opr_sz / 8; ++i) {
762         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
763     }
764 }
765 
766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
767 {
768     intptr_t i, opr_sz = simd_oprsz(desc);
769     int64_t *d = vd, *n = vn, *m = vm;
770 
771     for (i = 0; i < opr_sz / 8; ++i) {
772         d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
773     }
774 }
775 
776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
777 {
778     intptr_t i, j, opr_sz = simd_oprsz(desc);
779     int idx = simd_data(desc);
780     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
781 
782     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
783         int64_t mm = m[i];
784         for (j = 0; j < 16 / 8; ++j) {
785             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
786         }
787     }
788 }
789 
790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
791 {
792     intptr_t i, j, opr_sz = simd_oprsz(desc);
793     int idx = simd_data(desc);
794     int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
795 
796     for (i = 0; i < opr_sz / 8; i += 16 / 8) {
797         int64_t mm = m[i];
798         for (j = 0; j < 16 / 8; ++j) {
799             d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
800         }
801     }
802 }
803 
804 /* Integer 8- and 16-bit dot-product.
805  *
806  * Note that for the loops herein, host endianness does not matter
807  * with respect to the ordering of data within the quad-width lanes.
808  * All elements are treated equally, no matter where they are.
809  */
810 
811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
813 {                                                                         \
814     intptr_t i, opr_sz = simd_oprsz(desc);                                \
815     TYPED *d = vd, *a = va;                                               \
816     TYPEN *n = vn;                                                        \
817     TYPEM *m = vm;                                                        \
818     for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
819         d[i] = (a[i] +                                                    \
820                 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
821                 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
822                 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
823                 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
824     }                                                                     \
825     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
826 }
827 
828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
833 
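/*
 * Indexed dot-product: the four TYPEM values selected by the index
 * within each 16-byte segment of the m operand are shared by every
 * result lane of that segment, as for the other by-element forms.
 */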
834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
836 {                                                                         \
837     intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
838     intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
839     /*                                                                    \
840      * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
841      * first iteration might not be a full 16 byte segment. But           \
842      * for vector lengths beyond that this must be SVE and we know        \
843      * opr_sz is a multiple of 16, so we need not clamp segend            \
844      * to opr_sz_n when we advance it at the end of the loop.             \
845      */                                                                   \
846     intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
847     intptr_t index = simd_data(desc);                                     \
848     TYPED *d = vd, *a = va;                                               \
849     TYPEN *n = vn;                                                        \
850     TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
851     do {                                                                  \
852         TYPED m0 = m_indexed[i * 4 + 0];                                  \
853         TYPED m1 = m_indexed[i * 4 + 1];                                  \
854         TYPED m2 = m_indexed[i * 4 + 2];                                  \
855         TYPED m3 = m_indexed[i * 4 + 3];                                  \
856         do {                                                              \
857             d[i] = (a[i] +                                                \
858                     n[i * 4 + 0] * m0 +                                   \
859                     n[i * 4 + 1] * m1 +                                   \
860                     n[i * 4 + 2] * m2 +                                   \
861                     n[i * 4 + 3] * m3);                                   \
862         } while (++i < segend);                                           \
863         segend = i + (16 / sizeof(TYPED));                                \
864     } while (i < opr_sz_n);                                               \
865     clear_tail(d, opr_sz, simd_maxsz(desc));                              \
866 }
867 
868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
874 
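/*
 * FCADD: add the second operand rotated by 90 or 270 degrees in the
 * complex plane.  Elements are interleaved (real, imaginary) pairs;
 * the rot bit selects which half of each m pair is negated, and the
 * negation goes through the FPCR.AH-aware *_maybe_ah_chs helpers.
 */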
875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
876                          float_status *fpst, uint32_t desc)
877 {
878     uintptr_t opr_sz = simd_oprsz(desc);
879     float16 *d = vd;
880     float16 *n = vn;
881     float16 *m = vm;
882     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
883     bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
884     uintptr_t i;
885 
886     for (i = 0; i < opr_sz / 2; i += 2) {
887         float16 e0 = n[H2(i)];
888         float16 e1 = m[H2(i + 1)];
889         float16 e2 = n[H2(i + 1)];
890         float16 e3 = m[H2(i)];
891 
892         if (rot) {
893             e3 = float16_maybe_ah_chs(e3, fpcr_ah);
894         } else {
895             e1 = float16_maybe_ah_chs(e1, fpcr_ah);
896         }
897 
898         d[H2(i)] = float16_add(e0, e1, fpst);
899         d[H2(i + 1)] = float16_add(e2, e3, fpst);
900     }
901     clear_tail(d, opr_sz, simd_maxsz(desc));
902 }
903 
904 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
905                          float_status *fpst, uint32_t desc)
906 {
907     uintptr_t opr_sz = simd_oprsz(desc);
908     float32 *d = vd;
909     float32 *n = vn;
910     float32 *m = vm;
911     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
912     bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
913     uintptr_t i;
914 
915     for (i = 0; i < opr_sz / 4; i += 2) {
916         float32 e0 = n[H4(i)];
917         float32 e1 = m[H4(i + 1)];
918         float32 e2 = n[H4(i + 1)];
919         float32 e3 = m[H4(i)];
920 
921         if (rot) {
922             e3 = float32_maybe_ah_chs(e3, fpcr_ah);
923         } else {
924             e1 = float32_maybe_ah_chs(e1, fpcr_ah);
925         }
926 
927         d[H4(i)] = float32_add(e0, e1, fpst);
928         d[H4(i + 1)] = float32_add(e2, e3, fpst);
929     }
930     clear_tail(d, opr_sz, simd_maxsz(desc));
931 }
932 
933 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
934                          float_status *fpst, uint32_t desc)
935 {
936     uintptr_t opr_sz = simd_oprsz(desc);
937     float64 *d = vd;
938     float64 *n = vn;
939     float64 *m = vm;
940     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
941     bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
942     uintptr_t i;
943 
944     for (i = 0; i < opr_sz / 8; i += 2) {
945         float64 e0 = n[i];
946         float64 e1 = m[i + 1];
947         float64 e2 = n[i + 1];
948         float64 e3 = m[i];
949 
950         if (rot) {
951             e3 = float64_maybe_ah_chs(e3, fpcr_ah);
952         } else {
953             e1 = float64_maybe_ah_chs(e1, fpcr_ah);
954         }
955 
956         d[i] = float64_add(e0, e1, fpst);
957         d[i + 1] = float64_add(e2, e3, fpst);
958     }
959     clear_tail(d, opr_sz, simd_maxsz(desc));
960 }
961 
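/*
 * FCMLA: one step of a complex multiply-accumulate.  The rotation is
 * encoded as a "flip" bit selecting which element of each (real,
 * imaginary) pair feeds the multiply, plus per-lane negation applied
 * either by flipping the sign bit of the m operand (negx, FPCR.AH=0)
 * or via float_muladd_negate_product (negf, FPCR.AH=1).
 */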
962 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
963                          float_status *fpst, uint32_t desc)
964 {
965     uintptr_t opr_sz = simd_oprsz(desc);
966     float16 *d = vd, *n = vn, *m = vm, *a = va;
967     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
968     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
969     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
970     uint32_t negf_real = flip ^ negf_imag;
971     float16 negx_imag, negx_real;
972     uintptr_t i;
973 
974     /* With AH=0, use negx; with AH=1 use negf. */
975     negx_real = (negf_real & ~fpcr_ah) << 15;
976     negx_imag = (negf_imag & ~fpcr_ah) << 15;
977     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
978     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
979 
980     for (i = 0; i < opr_sz / 2; i += 2) {
981         float16 e2 = n[H2(i + flip)];
982         float16 e1 = m[H2(i + flip)] ^ negx_real;
983         float16 e4 = e2;
984         float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag;
985 
986         d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], negf_real, fpst);
987         d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], negf_imag, fpst);
988     }
989     clear_tail(d, opr_sz, simd_maxsz(desc));
990 }
991 
992 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
993                              float_status *fpst, uint32_t desc)
994 {
995     uintptr_t opr_sz = simd_oprsz(desc);
996     float16 *d = vd, *n = vn, *m = vm, *a = va;
997     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
998     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
999     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1000     uint32_t neg_real = flip ^ neg_imag;
1001     intptr_t elements = opr_sz / sizeof(float16);
1002     intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
1003     intptr_t i, j;
1004 
1005     /* Shift boolean to the sign bit so we can xor to negate.  */
1006     neg_real <<= 15;
1007     neg_imag <<= 15;
1008 
1009     for (i = 0; i < elements; i += eltspersegment) {
1010         float16 mr = m[H2(i + 2 * index + 0)];
1011         float16 mi = m[H2(i + 2 * index + 1)];
1012         float16 e1 = neg_real ^ (flip ? mi : mr);
1013         float16 e3 = neg_imag ^ (flip ? mr : mi);
1014 
1015         for (j = i; j < i + eltspersegment; j += 2) {
1016             float16 e2 = n[H2(j + flip)];
1017             float16 e4 = e2;
1018 
1019             d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
1020             d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
1021         }
1022     }
1023     clear_tail(d, opr_sz, simd_maxsz(desc));
1024 }
1025 
1026 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
1027                          float_status *fpst, uint32_t desc)
1028 {
1029     uintptr_t opr_sz = simd_oprsz(desc);
1030     float32 *d = vd, *n = vn, *m = vm, *a = va;
1031     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1032     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
1033     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1034     uint32_t negf_real = flip ^ negf_imag;
1035     float32 negx_imag, negx_real;
1036     uintptr_t i;
1037 
1038     /* With AH=0, use negx; with AH=1 use negf. */
1039     negx_real = (negf_real & ~fpcr_ah) << 31;
1040     negx_imag = (negf_imag & ~fpcr_ah) << 31;
1041     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1042     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1043 
1044     for (i = 0; i < opr_sz / 4; i += 2) {
1045         float32 e2 = n[H4(i + flip)];
1046         float32 e1 = m[H4(i + flip)] ^ negx_real;
1047         float32 e4 = e2;
1048         float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag;
1049 
1050         d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], negf_real, fpst);
1051         d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], negf_imag, fpst);
1052     }
1053     clear_tail(d, opr_sz, simd_maxsz(desc));
1054 }
1055 
1056 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
1057                              float_status *fpst, uint32_t desc)
1058 {
1059     uintptr_t opr_sz = simd_oprsz(desc);
1060     float32 *d = vd, *n = vn, *m = vm, *a = va;
1061     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1062     uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1063     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1064     uint32_t neg_real = flip ^ neg_imag;
1065     intptr_t elements = opr_sz / sizeof(float32);
1066     intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
1067     intptr_t i, j;
1068 
1069     /* Shift boolean to the sign bit so we can xor to negate.  */
1070     neg_real <<= 31;
1071     neg_imag <<= 31;
1072 
1073     for (i = 0; i < elements; i += eltspersegment) {
1074         float32 mr = m[H4(i + 2 * index + 0)];
1075         float32 mi = m[H4(i + 2 * index + 1)];
1076         float32 e1 = neg_real ^ (flip ? mi : mr);
1077         float32 e3 = neg_imag ^ (flip ? mr : mi);
1078 
1079         for (j = i; j < i + eltspersegment; j += 2) {
1080             float32 e2 = n[H4(j + flip)];
1081             float32 e4 = e2;
1082 
1083             d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
1084             d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
1085         }
1086     }
1087     clear_tail(d, opr_sz, simd_maxsz(desc));
1088 }
1089 
1090 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
1091                          float_status *fpst, uint32_t desc)
1092 {
1093     uintptr_t opr_sz = simd_oprsz(desc);
1094     float64 *d = vd, *n = vn, *m = vm, *a = va;
1095     intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
1096     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
1097     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1098     uint32_t negf_real = flip ^ negf_imag;
1099     float64 negx_real, negx_imag;
1100     uintptr_t i;
1101 
1102     /* With AH=0, use negx; with AH=1 use negf. */
1103     negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
1104     negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
1105     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
1106     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
1107 
1108     for (i = 0; i < opr_sz / 8; i += 2) {
1109         float64 e2 = n[i + flip];
1110         float64 e1 = m[i + flip] ^ negx_real;
1111         float64 e4 = e2;
1112         float64 e3 = m[i + 1 - flip] ^ negx_imag;
1113 
1114         d[i] = float64_muladd(e2, e1, a[i], negf_real, fpst);
1115         d[i + 1] = float64_muladd(e4, e3, a[i + 1], negf_imag, fpst);
1116     }
1117     clear_tail(d, opr_sz, simd_maxsz(desc));
1118 }
1119 
1120 /*
1121  * Floating point comparisons producing an integer result (all 1s or all 0s).
1122  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1123  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1124  */
1125 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
1126 {
1127     return -float16_eq_quiet(op1, op2, stat);
1128 }
1129 
1130 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
1131 {
1132     return -float32_eq_quiet(op1, op2, stat);
1133 }
1134 
1135 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
1136 {
1137     return -float64_eq_quiet(op1, op2, stat);
1138 }
1139 
1140 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
1141 {
1142     return -float16_le(op2, op1, stat);
1143 }
1144 
1145 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
1146 {
1147     return -float32_le(op2, op1, stat);
1148 }
1149 
1150 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
1151 {
1152     return -float64_le(op2, op1, stat);
1153 }
1154 
1155 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
1156 {
1157     return -float16_lt(op2, op1, stat);
1158 }
1159 
1160 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
1161 {
1162     return -float32_lt(op2, op1, stat);
1163 }
1164 
1165 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
1166 {
1167     return -float64_lt(op2, op1, stat);
1168 }
1169 
1170 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
1171 {
1172     return -float16_le(float16_abs(op2), float16_abs(op1), stat);
1173 }
1174 
1175 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1176 {
1177     return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1178 }
1179 
1180 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
1181 {
1182     return -float64_le(float64_abs(op2), float64_abs(op1), stat);
1183 }
1184 
1185 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1186 {
1187     return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1188 }
1189 
1190 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1191 {
1192     return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1193 }
1194 
1195 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
1196 {
1197     return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
1198 }
1199 
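/*
 * Half-precision float-to-integer conversions for the vector helpers:
 * the architecture requires a NaN input to raise Invalid Operation and
 * convert to zero, which these wrappers enforce explicitly.
 */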
1200 static int16_t vfp_tosszh(float16 x, float_status *fpst)
1201 {
1202     if (float16_is_any_nan(x)) {
1203         float_raise(float_flag_invalid, fpst);
1204         return 0;
1205     }
1206     return float16_to_int16_round_to_zero(x, fpst);
1207 }
1208 
1209 static uint16_t vfp_touszh(float16 x, float_status *fpst)
1210 {
1211     if (float16_is_any_nan(x)) {
1212         float_raise(float_flag_invalid, fpst);
1213         return 0;
1214     }
1215     return float16_to_uint16_round_to_zero(x, fpst);
1216 }
1217 
1218 #define DO_2OP(NAME, FUNC, TYPE) \
1219 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc)  \
1220 {                                                                 \
1221     intptr_t i, oprsz = simd_oprsz(desc);                         \
1222     TYPE *d = vd, *n = vn;                                        \
1223     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
1224         d[i] = FUNC(n[i], stat);                                  \
1225     }                                                             \
1226     clear_tail(d, oprsz, simd_maxsz(desc));                       \
1227 }
1228 
1229 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1230 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1231 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1232 
1233 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1234 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1235 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1236 
1237 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1238 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1239 
1240 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1241 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1242 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1243 DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1244 DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1245 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1246 DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1247 DO_2OP(gvec_touszh, vfp_touszh, float16)
1248 
1249 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
1250     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1251     {                                                           \
1252         return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
1253     }
1254 
1255 #define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
1256     static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
1257     {                                                           \
1258         return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
1259     }
1260 
1261 #define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
1262     WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
1263     WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
1264     WRAP_CMP0_##DIRN(FN, CMPOP, float64)                \
1265     DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
1266     DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)   \
1267     DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)
1268 
1269 DO_2OP_CMP0(cgt, cgt, FWD)
1270 DO_2OP_CMP0(cge, cge, FWD)
1271 DO_2OP_CMP0(ceq, ceq, FWD)
1272 DO_2OP_CMP0(clt, cgt, REV)
1273 DO_2OP_CMP0(cle, cge, REV)
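/*
 * For example, DO_2OP_CMP0(clt, cgt, REV) defines
 *     float16_clt0(op, stat) == float16_cgt(float16_zero, op, stat)
 * i.e. "compare less than zero" built from the reversed greater-than,
 * and similarly for float32 and float64.
 */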
1274 
1275 #undef DO_2OP
1276 #undef DO_2OP_CMP0
1277 
1278 /* Floating-point trigonometric starting value.
1279  * See the ARM ARM pseudocode function FPTrigSMul.
1280  */
1281 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1282 {
1283     float16 result = float16_mul(op1, op1, stat);
1284     if (!float16_is_any_nan(result)) {
1285         result = float16_set_sign(result, op2 & 1);
1286     }
1287     return result;
1288 }
1289 
1290 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1291 {
1292     float32 result = float32_mul(op1, op1, stat);
1293     if (!float32_is_any_nan(result)) {
1294         result = float32_set_sign(result, op2 & 1);
1295     }
1296     return result;
1297 }
1298 
1299 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1300 {
1301     float64 result = float64_mul(op1, op1, stat);
1302     if (!float64_is_any_nan(result)) {
1303         result = float64_set_sign(result, op2 & 1);
1304     }
1305     return result;
1306 }
1307 
1308 static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1309 {
1310     return float16_abs(float16_sub(op1, op2, stat));
1311 }
1312 
1313 static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1314 {
1315     return float32_abs(float32_sub(op1, op2, stat));
1316 }
1317 
1318 static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
1319 {
1320     return float64_abs(float64_sub(op1, op2, stat));
1321 }
1322 
1323 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
1324 static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
1325 {
1326     float16 r = float16_sub(op1, op2, stat);
1327     return float16_is_any_nan(r) ? r : float16_abs(r);
1328 }
1329 
1330 static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
1331 {
1332     float32 r = float32_sub(op1, op2, stat);
1333     return float32_is_any_nan(r) ? r : float32_abs(r);
1334 }
1335 
1336 static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
1337 {
1338     float64 r = float64_sub(op1, op2, stat);
1339     return float64_is_any_nan(r) ? r : float64_abs(r);
1340 }
1341 
1342 /*
1343  * Reciprocal step. These are the AArch32 versions, which use a
1344  * non-fused multiply-and-subtract.
1345  */
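/*
 * (2 - a*b) is the Newton-Raphson correction term for a reciprocal
 * estimate: refining an estimate x of 1/d uses x' = x * (2 - d*x).
 */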
1346 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1347 {
1348     op1 = float16_squash_input_denormal(op1, stat);
1349     op2 = float16_squash_input_denormal(op2, stat);
1350 
1351     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1352         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1353         return float16_two;
1354     }
1355     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1356 }
1357 
1358 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1359 {
1360     op1 = float32_squash_input_denormal(op1, stat);
1361     op2 = float32_squash_input_denormal(op2, stat);
1362 
1363     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1364         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1365         return float32_two;
1366     }
1367     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1368 }
1369 
1370 /* Reciprocal square-root step. AArch32 non-fused semantics. */
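/*
 * (3 - a*b) / 2 is the corresponding correction term for a reciprocal
 * square root estimate: x' = x * (3 - d*x*x) / 2.
 */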
1371 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1372 {
1373     op1 = float16_squash_input_denormal(op1, stat);
1374     op2 = float16_squash_input_denormal(op2, stat);
1375 
1376     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1377         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1378         return float16_one_point_five;
1379     }
1380     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1381     return float16_div(op1, float16_two, stat);
1382 }
1383 
1384 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1385 {
1386     op1 = float32_squash_input_denormal(op1, stat);
1387     op2 = float32_squash_input_denormal(op2, stat);
1388 
1389     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1390         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1391         return float32_one_point_five;
1392     }
1393     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1394     return float32_div(op1, float32_two, stat);
1395 }
1396 
1397 #define DO_3OP(NAME, FUNC, TYPE) \
1398 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1399                   float_status *stat, uint32_t desc)                       \
1400 {                                                                          \
1401     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1402     TYPE *d = vd, *n = vn, *m = vm;                                        \
1403     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1404         d[i] = FUNC(n[i], m[i], stat);                                     \
1405     }                                                                      \
1406     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1407 }
1408 
1409 DO_3OP(gvec_fadd_h, float16_add, float16)
1410 DO_3OP(gvec_fadd_s, float32_add, float32)
1411 DO_3OP(gvec_fadd_d, float64_add, float64)
1412 
1413 DO_3OP(gvec_fsub_h, float16_sub, float16)
1414 DO_3OP(gvec_fsub_s, float32_sub, float32)
1415 DO_3OP(gvec_fsub_d, float64_sub, float64)
1416 
1417 DO_3OP(gvec_fmul_h, float16_mul, float16)
1418 DO_3OP(gvec_fmul_s, float32_mul, float32)
1419 DO_3OP(gvec_fmul_d, float64_mul, float64)
1420 
1421 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1422 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1423 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1424 
1425 DO_3OP(gvec_fabd_h, float16_abd, float16)
1426 DO_3OP(gvec_fabd_s, float32_abd, float32)
1427 DO_3OP(gvec_fabd_d, float64_abd, float64)
1428 
1429 DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
1430 DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
1431 DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)
1432 
1433 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1434 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1435 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1436 
1437 DO_3OP(gvec_fcge_h, float16_cge, float16)
1438 DO_3OP(gvec_fcge_s, float32_cge, float32)
1439 DO_3OP(gvec_fcge_d, float64_cge, float64)
1440 
1441 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1442 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1443 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1444 
1445 DO_3OP(gvec_facge_h, float16_acge, float16)
1446 DO_3OP(gvec_facge_s, float32_acge, float32)
1447 DO_3OP(gvec_facge_d, float64_acge, float64)
1448 
1449 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1450 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1451 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1452 
1453 DO_3OP(gvec_fmax_h, float16_max, float16)
1454 DO_3OP(gvec_fmax_s, float32_max, float32)
1455 DO_3OP(gvec_fmax_d, float64_max, float64)
1456 
1457 DO_3OP(gvec_fmin_h, float16_min, float16)
1458 DO_3OP(gvec_fmin_s, float32_min, float32)
1459 DO_3OP(gvec_fmin_d, float64_min, float64)
1460 
1461 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1462 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1463 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1464 
1465 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1466 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1467 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1468 
1469 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1470 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1471 
1472 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1473 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1474 
1475 #ifdef TARGET_AARCH64
1476 DO_3OP(gvec_fdiv_h, float16_div, float16)
1477 DO_3OP(gvec_fdiv_s, float32_div, float32)
1478 DO_3OP(gvec_fdiv_d, float64_div, float64)
1479 
1480 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1481 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1482 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1483 
1484 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1485 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1486 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1487 
1488 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1489 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1490 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1491 
1492 DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
1493 DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
1494 DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)
1495 
1496 DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
1497 DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
1498 DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)
1499 
1500 DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
1501 DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
1502 DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)
1503 
1504 DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
1505 DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
1506 DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)
1507 
1508 #endif
1509 #undef DO_3OP
1510 
1511 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1512 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1513                                  float_status *stat)
1514 {
1515     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1516 }
1517 
1518 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1519                                  float_status *stat)
1520 {
1521     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1522 }
1523 
1524 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1525                                  float_status *stat)
1526 {
1527     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1528 }
1529 
1530 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1531                                  float_status *stat)
1532 {
1533     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1534 }
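
/*
 * Editorial note: the distinction from the fused helpers below is rounding.
 * These _nf versions round the product and then round the sum (two
 * roundings), matching Neon VMLA/VMLS, whereas floatN_muladd() rounds only
 * once at the end, which is what VFMA/VFMS require.
 */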
1535 
1536 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1537 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1538                                 float_status *stat)
1539 {
1540     return float16_muladd(op1, op2, dest, 0, stat);
1541 }
1542 
1543 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1544                                  float_status *stat)
1545 {
1546     return float32_muladd(op1, op2, dest, 0, stat);
1547 }
1548 
1549 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1550                                  float_status *stat)
1551 {
1552     return float64_muladd(op1, op2, dest, 0, stat);
1553 }
1554 
1555 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1556                                  float_status *stat)
1557 {
1558     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1559 }
1560 
1561 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1562                                  float_status *stat)
1563 {
1564     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1565 }
1566 
1567 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1568                                  float_status *stat)
1569 {
1570     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1571 }
1572 
1573 static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2,
1574                                  float_status *stat)
1575 {
1576     return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1577 }
1578 
1579 static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2,
1580                                  float_status *stat)
1581 {
1582     return float32_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1583 }
1584 
1585 static float64 float64_ah_mulsub_f(float64 dest, float64 op1, float64 op2,
1586                                  float_status *stat)
1587 {
1588     return float64_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1589 }
1590 
1591 #define DO_MULADD(NAME, FUNC, TYPE)                                        \
1592 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1593                   float_status *stat, uint32_t desc)                       \
1594 {                                                                          \
1595     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1596     TYPE *d = vd, *n = vn, *m = vm;                                        \
1597     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1598         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1599     }                                                                      \
1600     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1601 }
1602 
1603 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1604 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1605 
1606 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1607 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1608 
1609 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1610 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1611 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1612 
1613 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1614 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1615 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1616 
1617 DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16)
1618 DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32)
1619 DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64)
1620 
1621 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1622  * For AdvSIMD, there is of course only one such vector segment.
1623  */
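
/*
 * Worked example (illustrative): gvec_mul_idx_s with oprsz == 32 bytes and
 * idx == 1 has segment == 4 elements, so
 *     d[0..3] = n[0..3] * m[1]    (first 128-bit segment uses m[1])
 *     d[4..7] = n[4..7] * m[5]    (second segment uses m[4 + 1])
 * For a 16-byte AdvSIMD operation only the first segment exists.
 */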
1624 
1625 #define DO_MUL_IDX(NAME, TYPE, H) \
1626 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1627 {                                                                          \
1628     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1629     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1630     intptr_t idx = simd_data(desc);                                        \
1631     TYPE *d = vd, *n = vn, *m = vm;                                        \
1632     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1633         TYPE mm = m[H(i + idx)];                                           \
1634         for (j = 0; j < segment; j++) {                                    \
1635             d[i + j] = n[i + j] * mm;                                      \
1636         }                                                                  \
1637     }                                                                      \
1638     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1639 }
1640 
1641 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1642 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1643 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1644 
1645 #undef DO_MUL_IDX
1646 
1647 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1648 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1649 {                                                                          \
1650     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1651     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1652     intptr_t idx = simd_data(desc);                                        \
1653     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1654     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1655         TYPE mm = m[H(i + idx)];                                           \
1656         for (j = 0; j < segment; j++) {                                    \
1657             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1658         }                                                                  \
1659     }                                                                      \
1660     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1661 }
1662 
1663 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1664 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1665 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1666 
1667 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1668 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1669 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1670 
1671 #undef DO_MLA_IDX
1672 
1673 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1674 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1675                   float_status *stat, uint32_t desc)                       \
1676 {                                                                          \
1677     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1678     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1679     intptr_t idx = simd_data(desc);                                        \
1680     TYPE *d = vd, *n = vn, *m = vm;                                        \
1681     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1682         TYPE mm = m[H(i + idx)];                                           \
1683         for (j = 0; j < segment; j++) {                                    \
1684             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1685         }                                                                  \
1686     }                                                                      \
1687     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1688 }
1689 
1690 #define nop(N, M, S) (M)
1691 
1692 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1693 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1694 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1695 
1696 #ifdef TARGET_AARCH64
1697 
1698 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1699 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1700 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1701 
1702 #endif
1703 
1704 #undef nop
1705 
1706 /*
1707  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1708  * the fused ops below, these accumulate both from and into Vd.
1709  */
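/*
 * Editorial note: with ADD bound to floatN_add/floatN_sub, DO_FMUL_IDX
 * expands to e.g.
 *     d[i + j] = float16_add(d[i + j], float16_mul(n[i + j], mm, stat), stat);
 * so Vd itself is the accumulator, whereas the fused DO_FMLA_IDX helpers
 * further below take a separate accumulator operand (va).
 */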
1710 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1711 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1712 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1713 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1714 
1715 #undef DO_FMUL_IDX
1716 
1717 #define DO_FMLA_IDX(NAME, TYPE, H, NEGX, NEGF)                             \
1718 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1719                   float_status *stat, uint32_t desc)                       \
1720 {                                                                          \
1721     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1722     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1723     intptr_t idx = simd_data(desc);                                        \
1724     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1725     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1726         TYPE mm = m[H(i + idx)];                                           \
1727         for (j = 0; j < segment; j++) {                                    \
1728             d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm,                  \
1729                                      a[i + j], NEGF, stat);                \
1730         }                                                                  \
1731     }                                                                      \
1732     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1733 }
1734 
1735 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0)
1736 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0)
1737 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0)
1738 
1739 DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0)
1740 DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0)
1741 DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0)
1742 
1743 DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product)
1744 DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product)
1745 DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product)
1746 
1747 #undef DO_FMLA_IDX
1748 
1749 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1750 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1751 {                                                                          \
1752     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1753     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1754     bool q = false;                                                        \
1755     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1756         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1757         if (dd < MIN) {                                                    \
1758             dd = MIN;                                                      \
1759             q = true;                                                      \
1760         } else if (dd > MAX) {                                             \
1761             dd = MAX;                                                      \
1762             q = true;                                                      \
1763         }                                                                  \
1764         d[i] = dd;                                                         \
1765     }                                                                      \
1766     if (q) {                                                               \
1767         uint32_t *qc = vq;                                                 \
1768         qc[0] = 1;                                                         \
1769     }                                                                      \
1770     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1771 }
1772 
1773 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1774 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1775 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1776 
1777 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1778 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1779 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1780 
1781 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1782 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1783 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1784 
1785 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1786 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1787 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1788 
1789 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1790 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1791 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1792 
1793 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1794 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1795 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1796 
1797 #undef DO_SAT
1798 
1799 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1800                           void *vm, uint32_t desc)
1801 {
1802     intptr_t i, oprsz = simd_oprsz(desc);
1803     uint64_t *d = vd, *n = vn, *m = vm;
1804     bool q = false;
1805 
1806     for (i = 0; i < oprsz / 8; i++) {
1807         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1808         if (dd < nn) {
1809             dd = UINT64_MAX;
1810             q = true;
1811         }
1812         d[i] = dd;
1813     }
1814     if (q) {
1815         uint32_t *qc = vq;
1816         qc[0] = 1;
1817     }
1818     clear_tail(d, oprsz, simd_maxsz(desc));
1819 }
1820 
1821 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1822                           void *vm, uint32_t desc)
1823 {
1824     intptr_t i, oprsz = simd_oprsz(desc);
1825     uint64_t *d = vd, *n = vn, *m = vm;
1826     bool q = false;
1827 
1828     for (i = 0; i < oprsz / 8; i++) {
1829         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1830         if (nn < mm) {
1831             dd = 0;
1832             q = true;
1833         }
1834         d[i] = dd;
1835     }
1836     if (q) {
1837         uint32_t *qc = vq;
1838         qc[0] = 1;
1839     }
1840     clear_tail(d, oprsz, simd_maxsz(desc));
1841 }
1842 
1843 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1844                           void *vm, uint32_t desc)
1845 {
1846     intptr_t i, oprsz = simd_oprsz(desc);
1847     int64_t *d = vd, *n = vn, *m = vm;
1848     bool q = false;
1849 
1850     for (i = 0; i < oprsz / 8; i++) {
1851         int64_t nn = n[i], mm = m[i], dd = nn + mm;
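        /*
         * Signed overflow iff nn and mm have the same sign but dd does not;
         * saturate toward the sign of nn.  gvec_sqsub_d below uses the
         * analogous test, with nn and mm required to differ in sign.
         */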
1852         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1853             dd = (nn >> 63) ^ ~INT64_MIN;
1854             q = true;
1855         }
1856         d[i] = dd;
1857     }
1858     if (q) {
1859         uint32_t *qc = vq;
1860         qc[0] = 1;
1861     }
1862     clear_tail(d, oprsz, simd_maxsz(desc));
1863 }
1864 
1865 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1866                           void *vm, uint32_t desc)
1867 {
1868     intptr_t i, oprsz = simd_oprsz(desc);
1869     int64_t *d = vd, *n = vn, *m = vm;
1870     bool q = false;
1871 
1872     for (i = 0; i < oprsz / 8; i++) {
1873         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1874         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1875             dd = (nn >> 63) ^ ~INT64_MIN;
1876             q = true;
1877         }
1878         d[i] = dd;
1879     }
1880     if (q) {
1881         uint32_t *qc = vq;
1882         qc[0] = 1;
1883     }
1884     clear_tail(d, oprsz, simd_maxsz(desc));
1885 }
1886 
1887 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1888                            void *vm, uint32_t desc)
1889 {
1890     intptr_t i, oprsz = simd_oprsz(desc);
1891     uint64_t *d = vd, *n = vn, *m = vm;
1892     bool q = false;
1893 
1894     for (i = 0; i < oprsz / 8; i++) {
1895         uint64_t nn = n[i];
1896         int64_t mm = m[i];
1897         uint64_t dd = nn + mm;
1898 
1899         if (mm < 0) {
1900             if (nn < (uint64_t)-mm) {
1901                 dd = 0;
1902                 q = true;
1903             }
1904         } else {
1905             if (dd < nn) {
1906                 dd = UINT64_MAX;
1907                 q = true;
1908             }
1909         }
1910         d[i] = dd;
1911     }
1912     if (q) {
1913         uint32_t *qc = vq;
1914         qc[0] = 1;
1915     }
1916     clear_tail(d, oprsz, simd_maxsz(desc));
1917 }
1918 
1919 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
1920                            void *vm, uint32_t desc)
1921 {
1922     intptr_t i, oprsz = simd_oprsz(desc);
1923     uint64_t *d = vd, *n = vn, *m = vm;
1924     bool q = false;
1925 
1926     for (i = 0; i < oprsz / 8; i++) {
1927         int64_t nn = n[i];
1928         uint64_t mm = m[i];
1929         int64_t dd = nn + mm;
1930 
1931         if (mm > (uint64_t)(INT64_MAX - nn)) {
1932             dd = INT64_MAX;
1933             q = true;
1934         }
1935         d[i] = dd;
1936     }
1937     if (q) {
1938         uint32_t *qc = vq;
1939         qc[0] = 1;
1940     }
1941     clear_tail(d, oprsz, simd_maxsz(desc));
1942 }
1943 
1944 #define DO_SRA(NAME, TYPE)                              \
1945 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1946 {                                                       \
1947     intptr_t i, oprsz = simd_oprsz(desc);               \
1948     int shift = simd_data(desc);                        \
1949     TYPE *d = vd, *n = vn;                              \
1950     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
1951         d[i] += n[i] >> shift;                          \
1952     }                                                   \
1953     clear_tail(d, oprsz, simd_maxsz(desc));             \
1954 }
1955 
1956 DO_SRA(gvec_ssra_b, int8_t)
1957 DO_SRA(gvec_ssra_h, int16_t)
1958 DO_SRA(gvec_ssra_s, int32_t)
1959 DO_SRA(gvec_ssra_d, int64_t)
1960 
1961 DO_SRA(gvec_usra_b, uint8_t)
1962 DO_SRA(gvec_usra_h, uint16_t)
1963 DO_SRA(gvec_usra_s, uint32_t)
1964 DO_SRA(gvec_usra_d, uint64_t)
1965 
1966 #undef DO_SRA
1967 
1968 #define DO_RSHR(NAME, TYPE)                             \
1969 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1970 {                                                       \
1971     intptr_t i, oprsz = simd_oprsz(desc);               \
1972     int shift = simd_data(desc);                        \
1973     TYPE *d = vd, *n = vn;                              \
1974     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
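        /* n >> (shift - 1) leaves the rounding bit in bit 0 of tmp. */    \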
1975         TYPE tmp = n[i] >> (shift - 1);                 \
1976         d[i] = (tmp >> 1) + (tmp & 1);                  \
1977     }                                                   \
1978     clear_tail(d, oprsz, simd_maxsz(desc));             \
1979 }
1980 
1981 DO_RSHR(gvec_srshr_b, int8_t)
1982 DO_RSHR(gvec_srshr_h, int16_t)
1983 DO_RSHR(gvec_srshr_s, int32_t)
1984 DO_RSHR(gvec_srshr_d, int64_t)
1985 
1986 DO_RSHR(gvec_urshr_b, uint8_t)
1987 DO_RSHR(gvec_urshr_h, uint16_t)
1988 DO_RSHR(gvec_urshr_s, uint32_t)
1989 DO_RSHR(gvec_urshr_d, uint64_t)
1990 
1991 #undef DO_RSHR
1992 
1993 #define DO_RSRA(NAME, TYPE)                             \
1994 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
1995 {                                                       \
1996     intptr_t i, oprsz = simd_oprsz(desc);               \
1997     int shift = simd_data(desc);                        \
1998     TYPE *d = vd, *n = vn;                              \
1999     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2000         TYPE tmp = n[i] >> (shift - 1);                 \
2001         d[i] += (tmp >> 1) + (tmp & 1);                 \
2002     }                                                   \
2003     clear_tail(d, oprsz, simd_maxsz(desc));             \
2004 }
2005 
2006 DO_RSRA(gvec_srsra_b, int8_t)
2007 DO_RSRA(gvec_srsra_h, int16_t)
2008 DO_RSRA(gvec_srsra_s, int32_t)
2009 DO_RSRA(gvec_srsra_d, int64_t)
2010 
2011 DO_RSRA(gvec_ursra_b, uint8_t)
2012 DO_RSRA(gvec_ursra_h, uint16_t)
2013 DO_RSRA(gvec_ursra_s, uint32_t)
2014 DO_RSRA(gvec_ursra_d, uint64_t)
2015 
2016 #undef DO_RSRA
2017 
2018 #define DO_SRI(NAME, TYPE)                              \
2019 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2020 {                                                       \
2021     intptr_t i, oprsz = simd_oprsz(desc);               \
2022     int shift = simd_data(desc);                        \
2023     TYPE *d = vd, *n = vn;                              \
2024     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2025         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
2026     }                                                   \
2027     clear_tail(d, oprsz, simd_maxsz(desc));             \
2028 }
2029 
2030 DO_SRI(gvec_sri_b, uint8_t)
2031 DO_SRI(gvec_sri_h, uint16_t)
2032 DO_SRI(gvec_sri_s, uint32_t)
2033 DO_SRI(gvec_sri_d, uint64_t)
2034 
2035 #undef DO_SRI
2036 
2037 #define DO_SLI(NAME, TYPE)                              \
2038 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2039 {                                                       \
2040     intptr_t i, oprsz = simd_oprsz(desc);               \
2041     int shift = simd_data(desc);                        \
2042     TYPE *d = vd, *n = vn;                              \
2043     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2044         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
2045     }                                                   \
2046     clear_tail(d, oprsz, simd_maxsz(desc));             \
2047 }
2048 
2049 DO_SLI(gvec_sli_b, uint8_t)
2050 DO_SLI(gvec_sli_h, uint16_t)
2051 DO_SLI(gvec_sli_s, uint32_t)
2052 DO_SLI(gvec_sli_d, uint64_t)
2053 
2054 #undef DO_SLI
2055 
2056 /*
2057  * Convert float16 to float32, raising no exceptions and
2058  * preserving exceptional values, including SNaN.
2059  * This is effectively an unpack+repack operation.
2060  */
2061 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
2062 {
2063     const int f16_bias = 15;
2064     const int f32_bias = 127;
2065     uint32_t sign = extract32(f16, 15, 1);
2066     uint32_t exp = extract32(f16, 10, 5);
2067     uint32_t frac = extract32(f16, 0, 10);
2068 
2069     if (exp == 0x1f) {
2070         /* Inf or NaN */
2071         exp = 0xff;
2072     } else if (exp == 0) {
2073         /* Zero or denormal.  */
2074         if (frac != 0) {
2075             if (fz16) {
2076                 frac = 0;
2077             } else {
2078                 /*
2079                  * Denormal; these are all normal float32.
2080                  * Shift the fraction so that the msb is at bit 11,
2081                  * then remove bit 11 as the implicit bit of the
2082                  * normalized float32.  Note that we still go through
2083                  * the shift for normal numbers below, to put the
2084                  * float32 fraction at the right place.
2085                  */
2086                 int shift = clz32(frac) - 21;
2087                 frac = (frac << shift) & 0x3ff;
2088                 exp = f32_bias - f16_bias - shift + 1;
2089             }
2090         }
2091     } else {
2092         /* Normal number; adjust the bias.  */
2093         exp += f32_bias - f16_bias;
2094     }
2095     sign <<= 31;
2096     exp <<= 23;
2097     frac <<= 23 - 10;
2098 
2099     return sign | exp | frac;
2100 }
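
/*
 * Worked examples for float16_to_float32_by_bits() (illustrative):
 *   f16 0x3c00 (1.0): exp 15 -> 15 + (127 - 15) = 127, frac 0,
 *       giving f32 0x3f800000 (1.0).
 *   f16 0x0001 (smallest subnormal, 2**-24), fz16 false:
 *       shift = clz32(1) - 21 = 10, frac -> 0, exp = 127 - 15 - 10 + 1 = 103,
 *       giving f32 0x33800000, i.e. 2**-24 as a normal float32.
 */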
2101 
2102 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2103 {
2104     /*
2105      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2106      * Load the 2nd qword iff is_q & is_2.
2107      * Shift to the 2nd dword iff !is_q & is_2.
2108      * For !is_q & !is_2, the upper bits of the result are garbage.
2109      */
2110     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2111 }
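
/*
 * Illustrative truth table for load4_f16():
 *   is_q  is_2   result
 *    0     0     ptr[0]        (low 32 bits valid, high bits garbage)
 *    0     1     ptr[0] >> 32  (2nd dword)
 *    1     0     ptr[0]        (1st qword)
 *    1     1     ptr[1]        (2nd qword)
 */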
2112 
2113 /*
2114  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2115  * as there are not yet SVE versions that might use blocking.
2116  */
2117 
2118 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
2119                      uint32_t desc, bool fz16)
2120 {
2121     intptr_t i, oprsz = simd_oprsz(desc);
2122     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2123     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2124     int is_q = oprsz == 16;
2125     uint64_t n_4, m_4;
2126 
2127     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2128     n_4 = load4_f16(vn, is_q, is_2);
2129     m_4 = load4_f16(vm, is_q, is_2);
2130 
2131     /* Negate all inputs for FMLSL at once.  */
2132     if (is_s) {
2133         n_4 ^= 0x8000800080008000ull;
2134     }
2135 
2136     for (i = 0; i < oprsz / 4; i++) {
2137         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2138         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2139         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2140     }
2141     clear_tail(d, oprsz, simd_maxsz(desc));
2142 }
2143 
2144 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2145                             CPUARMState *env, uint32_t desc)
2146 {
2147     do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2148              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2149 }
2150 
2151 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2152                             CPUARMState *env, uint32_t desc)
2153 {
2154     do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, desc,
2155              get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2156 }
2157 
2158 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2159                                CPUARMState *env, uint32_t desc)
2160 {
2161     intptr_t i, oprsz = simd_oprsz(desc);
2162     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2163     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2164     float_status *status = &env->vfp.fp_status_a64;
2165     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2166 
2167     for (i = 0; i < oprsz; i += sizeof(float32)) {
2168         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
2169         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2170         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2171         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2172         float32 aa = *(float32 *)(va + H1_4(i));
2173 
2174         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
2175     }
2176 }
2177 
2178 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
2179                          uint32_t desc, bool fz16)
2180 {
2181     intptr_t i, oprsz = simd_oprsz(desc);
2182     int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2183     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2184     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2185     int is_q = oprsz == 16;
2186     uint64_t n_4;
2187     float32 m_1;
2188 
2189     /* Pre-load all of the f16 data, avoiding overlap issues.  */
2190     n_4 = load4_f16(vn, is_q, is_2);
2191 
2192     /* Negate all inputs for FMLSL at once.  */
2193     if (is_s) {
2194         n_4 ^= 0x8000800080008000ull;
2195     }
2196 
2197     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2198 
2199     for (i = 0; i < oprsz / 4; i++) {
2200         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2201         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
2202     }
2203     clear_tail(d, oprsz, simd_maxsz(desc));
2204 }
2205 
2206 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2207                                 CPUARMState *env, uint32_t desc)
2208 {
2209     do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
2210                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
2211 }
2212 
2213 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2214                                 CPUARMState *env, uint32_t desc)
2215 {
2216     do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, desc,
2217                  get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
2218 }
2219 
2220 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2221                                CPUARMState *env, uint32_t desc)
2222 {
2223     intptr_t i, j, oprsz = simd_oprsz(desc);
2224     uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
2225     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2226     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
2227     float_status *status = &env->vfp.fp_status_a64;
2228     bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);
2229 
2230     for (i = 0; i < oprsz; i += 16) {
2231         float16 mm_16 = *(float16 *)(vm + i + idx);
2232         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2233 
2234         for (j = 0; j < 16; j += sizeof(float32)) {
2235             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
2236             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2237             float32 aa = *(float32 *)(va + H1_4(i + j));
2238 
2239             *(float32 *)(vd + H1_4(i + j)) =
2240                 float32_muladd(nn, mm, aa, 0, status);
2241         }
2242     }
2243 }
2244 
2245 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2246 {
2247     intptr_t i, opr_sz = simd_oprsz(desc);
2248     int8_t *d = vd, *n = vn, *m = vm;
2249 
2250     for (i = 0; i < opr_sz; ++i) {
2251         int8_t mm = m[i];
2252         int8_t nn = n[i];
2253         int8_t res = 0;
2254         if (mm >= 0) {
2255             if (mm < 8) {
2256                 res = nn << mm;
2257             }
2258         } else {
2259             res = nn >> (mm > -8 ? -mm : 7);
2260         }
2261         d[i] = res;
2262     }
2263     clear_tail(d, opr_sz, simd_maxsz(desc));
2264 }
2265 
2266 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2267 {
2268     intptr_t i, opr_sz = simd_oprsz(desc);
2269     int16_t *d = vd, *n = vn, *m = vm;
2270 
2271     for (i = 0; i < opr_sz / 2; ++i) {
2272         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2273         int16_t nn = n[i];
2274         int16_t res = 0;
2275         if (mm >= 0) {
2276             if (mm < 16) {
2277                 res = nn << mm;
2278             }
2279         } else {
2280             res = nn >> (mm > -16 ? -mm : 15);
2281         }
2282         d[i] = res;
2283     }
2284     clear_tail(d, opr_sz, simd_maxsz(desc));
2285 }
2286 
2287 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2288 {
2289     intptr_t i, opr_sz = simd_oprsz(desc);
2290     uint8_t *d = vd, *n = vn, *m = vm;
2291 
2292     for (i = 0; i < opr_sz; ++i) {
2293         int8_t mm = m[i];
2294         uint8_t nn = n[i];
2295         uint8_t res = 0;
2296         if (mm >= 0) {
2297             if (mm < 8) {
2298                 res = nn << mm;
2299             }
2300         } else {
2301             if (mm > -8) {
2302                 res = nn >> -mm;
2303             }
2304         }
2305         d[i] = res;
2306     }
2307     clear_tail(d, opr_sz, simd_maxsz(desc));
2308 }
2309 
2310 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2311 {
2312     intptr_t i, opr_sz = simd_oprsz(desc);
2313     uint16_t *d = vd, *n = vn, *m = vm;
2314 
2315     for (i = 0; i < opr_sz / 2; ++i) {
2316         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2317         uint16_t nn = n[i];
2318         uint16_t res = 0;
2319         if (mm >= 0) {
2320             if (mm < 16) {
2321                 res = nn << mm;
2322             }
2323         } else {
2324             if (mm > -16) {
2325                 res = nn >> -mm;
2326             }
2327         }
2328         d[i] = res;
2329     }
2330     clear_tail(d, opr_sz, simd_maxsz(desc));
2331 }
2332 
2333 /*
2334  * 8x8->8 polynomial multiply.
2335  *
2336  * Polynomial multiplication is like integer multiplication except the
2337  * partial products are XORed, not added.
2338  *
2339  * TODO: expose this as a generic vector operation, as it is a common
2340  * crypto building block.
2341  */
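/*
 * Worked example (illustrative): 0b0011 * 0b0011 over GF(2) is
 * (x + 1)^2 = x^2 + 1, i.e. 0b0011 ^ (0b0011 << 1) = 0b0101, not the
 * integer product 9: partial products are combined with XOR, so no
 * carries propagate between columns.
 */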
2342 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2343 {
2344     intptr_t i, opr_sz = simd_oprsz(desc);
2345     uint64_t *d = vd, *n = vn, *m = vm;
2346 
2347     for (i = 0; i < opr_sz / 8; ++i) {
2348         d[i] = clmul_8x8_low(n[i], m[i]);
2349     }
2350     clear_tail(d, opr_sz, simd_maxsz(desc));
2351 }
2352 
2353 /*
2354  * 64x64->128 polynomial multiply.
2355  * Because the lanes are not accessed in strict columns,
2356  * this probably cannot be turned into a generic helper.
2357  */
2358 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2359 {
2360     intptr_t i, opr_sz = simd_oprsz(desc);
2361     intptr_t hi = simd_data(desc);
2362     uint64_t *d = vd, *n = vn, *m = vm;
2363 
2364     for (i = 0; i < opr_sz / 8; i += 2) {
2365         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2366         d[i] = int128_getlo(r);
2367         d[i + 1] = int128_gethi(r);
2368     }
2369     clear_tail(d, opr_sz, simd_maxsz(desc));
2370 }
2371 
2372 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2373 {
2374     int hi = simd_data(desc);
2375     uint64_t *d = vd, *n = vn, *m = vm;
2376     uint64_t nn = n[hi], mm = m[hi];
2377 
2378     d[0] = clmul_8x4_packed(nn, mm);
2379     nn >>= 32;
2380     mm >>= 32;
2381     d[1] = clmul_8x4_packed(nn, mm);
2382 
2383     clear_tail(d, 16, simd_maxsz(desc));
2384 }
2385 
2386 #ifdef TARGET_AARCH64
2387 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2388 {
2389     int shift = simd_data(desc) * 8;
2390     intptr_t i, opr_sz = simd_oprsz(desc);
2391     uint64_t *d = vd, *n = vn, *m = vm;
2392 
2393     for (i = 0; i < opr_sz / 8; ++i) {
2394         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2395     }
2396 }
2397 
2398 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2399 {
2400     intptr_t sel = H4(simd_data(desc));
2401     intptr_t i, opr_sz = simd_oprsz(desc);
2402     uint32_t *n = vn, *m = vm;
2403     uint64_t *d = vd;
2404 
2405     for (i = 0; i < opr_sz / 8; ++i) {
2406         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2407     }
2408 }
2409 #endif
2410 
2411 #define DO_CMP0(NAME, TYPE, OP)                         \
2412 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2413 {                                                       \
2414     intptr_t i, opr_sz = simd_oprsz(desc);              \
2415     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2416         TYPE nn = *(TYPE *)(vn + i);                    \
2417         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2418     }                                                   \
2419     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2420 }
2421 
2422 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2423 DO_CMP0(gvec_clt0_b, int8_t, <)
2424 DO_CMP0(gvec_cle0_b, int8_t, <=)
2425 DO_CMP0(gvec_cgt0_b, int8_t, >)
2426 DO_CMP0(gvec_cge0_b, int8_t, >=)
2427 
2428 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2429 DO_CMP0(gvec_clt0_h, int16_t, <)
2430 DO_CMP0(gvec_cle0_h, int16_t, <=)
2431 DO_CMP0(gvec_cgt0_h, int16_t, >)
2432 DO_CMP0(gvec_cge0_h, int16_t, >=)
2433 
2434 #undef DO_CMP0
2435 
2436 #define DO_ABD(NAME, TYPE)                                      \
2437 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2438 {                                                               \
2439     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2440     TYPE *d = vd, *n = vn, *m = vm;                             \
2441                                                                 \
2442     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2443         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2444     }                                                           \
2445     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2446 }
2447 
2448 DO_ABD(gvec_sabd_b, int8_t)
2449 DO_ABD(gvec_sabd_h, int16_t)
2450 DO_ABD(gvec_sabd_s, int32_t)
2451 DO_ABD(gvec_sabd_d, int64_t)
2452 
2453 DO_ABD(gvec_uabd_b, uint8_t)
2454 DO_ABD(gvec_uabd_h, uint16_t)
2455 DO_ABD(gvec_uabd_s, uint32_t)
2456 DO_ABD(gvec_uabd_d, uint64_t)
2457 
2458 #undef DO_ABD
2459 
2460 #define DO_ABA(NAME, TYPE)                                      \
2461 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2462 {                                                               \
2463     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2464     TYPE *d = vd, *n = vn, *m = vm;                             \
2465                                                                 \
2466     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2467         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2468     }                                                           \
2469     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2470 }
2471 
2472 DO_ABA(gvec_saba_b, int8_t)
2473 DO_ABA(gvec_saba_h, int16_t)
2474 DO_ABA(gvec_saba_s, int32_t)
2475 DO_ABA(gvec_saba_d, int64_t)
2476 
2477 DO_ABA(gvec_uaba_b, uint8_t)
2478 DO_ABA(gvec_uaba_h, uint16_t)
2479 DO_ABA(gvec_uaba_s, uint32_t)
2480 DO_ABA(gvec_uaba_d, uint64_t)
2481 
2482 #undef DO_ABA
2483 
2484 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2485 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
2486                   float_status *stat, uint32_t desc)                       \
2487 {                                                                          \
2488     ARMVectorReg scratch;                                                  \
2489     intptr_t oprsz = simd_oprsz(desc);                                     \
2490     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2491     TYPE *d = vd, *n = vn, *m = vm;                                        \
2492     if (unlikely(d == m)) {                                                \
2493         m = memcpy(&scratch, m, oprsz);                                    \
2494     }                                                                      \
2495     for (intptr_t i = 0; i < half; ++i) {                                  \
2496         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2497     }                                                                      \
2498     for (intptr_t i = 0; i < half; ++i) {                                  \
2499         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2500     }                                                                      \
2501     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2502 }
2503 
2504 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2505 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2506 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2507 
2508 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2509 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2510 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2511 
2512 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2513 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2514 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2515 
2516 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2517 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2518 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2519 
2520 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2521 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2522 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2523 
2524 #ifdef TARGET_AARCH64
2525 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
2526 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
2527 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, )
2528 
2529 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
2530 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
2531 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, )
2532 #endif
2533 
2534 #undef DO_3OP_PAIR
2535 
2536 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2537 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2538 {                                                               \
2539     ARMVectorReg scratch;                                       \
2540     intptr_t oprsz = simd_oprsz(desc);                          \
2541     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2542     TYPE *d = vd, *n = vn, *m = vm;                             \
2543     if (unlikely(d == m)) {                                     \
2544         m = memcpy(&scratch, m, oprsz);                         \
2545     }                                                           \
2546     for (intptr_t i = 0; i < half; ++i) {                       \
2547         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2548     }                                                           \
2549     for (intptr_t i = 0; i < half; ++i) {                       \
2550         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2551     }                                                           \
2552     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2553 }
2554 
2555 #define ADD(A, B) (A + B)
2556 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2557 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2558 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2559 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2560 #undef  ADD
2561 
2562 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2563 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2564 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2565 
2566 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2567 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2568 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2569 
2570 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2571 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2572 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2573 
2574 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2575 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2576 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2577 
2578 #undef DO_3OP_PAIR
2579 
2580 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2581     void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
2582     {                                                                   \
2583         intptr_t i, oprsz = simd_oprsz(desc);                           \
2584         int shift = simd_data(desc);                                    \
2585         TYPE *d = vd, *n = vn;                                          \
2586         float_status *fpst = stat;                                      \
2587         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2588             d[i] = FUNC(n[i], shift, fpst);                             \
2589         }                                                               \
2590         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2591     }
2592 
2593 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2594 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2595 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2596 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2597 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2598 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2599 
2600 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2601 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2602 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2603 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2604 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2605 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2606 
2607 #undef DO_VCVT_FIXED
2608 
2609 #define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
2610     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2611     {                                                                   \
2612         intptr_t i, oprsz = simd_oprsz(desc);                           \
2613         uint32_t rmode = simd_data(desc);                               \
2614         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2615         TYPE *d = vd, *n = vn;                                          \
2616         set_float_rounding_mode(rmode, fpst);                           \
2617         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2618             d[i] = FUNC(n[i], 0, fpst);                                 \
2619         }                                                               \
2620         set_float_rounding_mode(prev_rmode, fpst);                      \
2621         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2622     }
2623 
2624 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
2625 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
2626 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2627 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2628 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2629 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2630 
2631 #undef DO_VCVT_RMODE
2632 
2633 #define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
2634     void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
2635     {                                                                   \
2636         intptr_t i, oprsz = simd_oprsz(desc);                           \
2637         uint32_t rmode = simd_data(desc);                               \
2638         uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
2639         TYPE *d = vd, *n = vn;                                          \
2640         set_float_rounding_mode(rmode, fpst);                           \
2641         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2642             d[i] = FUNC(n[i], fpst);                                    \
2643         }                                                               \
2644         set_float_rounding_mode(prev_rmode, fpst);                      \
2645         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2646     }
2647 
2648 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2649 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2650 
2651 #undef DO_VRINT_RMODE
2652 
2653 #ifdef TARGET_AARCH64
2654 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
2655 {
2656     const uint8_t *indices = vm;
2657     size_t oprsz = simd_oprsz(desc);
2658     uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2659     bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2660     uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2661     union {
2662         uint8_t b[16];
2663         uint64_t d[2];
2664     } result;
2665 
2666     /*
2667      * We must construct the final result in a temp, lest the output
2668      * overlaps the input table.  For TBL, begin with zero; for TBX,
2669      * begin with the original register contents.  Note that we always
2670      * copy 16 bytes here to avoid an extra branch; clearing the high
2671      * bits of the register for oprsz == 8 is handled below.
2672      */
2673     if (is_tbx) {
2674         memcpy(&result, vd, 16);
2675     } else {
2676         memset(&result, 0, 16);
2677     }
2678 
2679     for (size_t i = 0; i < oprsz; ++i) {
2680         uint32_t index = indices[H1(i)];
2681 
2682         if (index < table_len) {
2683             /*
2684              * Convert index (a byte offset into the virtual table
2685              * which is a series of 128-bit vectors concatenated)
2686              * into the correct register element, bearing in mind
2687              * that the table can wrap around from V31 to V0.
2688              */
2689             const uint8_t *table = (const uint8_t *)
2690                 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2691             result.b[H1(i)] = table[H1(index % 16)];
2692         }
2693     }
2694 
2695     memcpy(vd, &result, 16);
2696     clear_tail(vd, oprsz, simd_maxsz(desc));
2697 }
2698 #endif
2699 
2700 /*
2701  * NxN -> N highpart multiply
2702  *
2703  * TODO: expose this as a generic vector operation.
2704  */
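/*
 * Illustrative example for the byte case: 0xc8 * 0xc8 is 200 * 200 = 40000
 * (0x9c40) unsigned, so gvec_umulh_b returns 0x9c per lane; interpreted as
 * signed it is -56 * -56 = 3136 (0x0c40), so gvec_smulh_b returns 0x0c.
 */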
2705 
2706 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2707 {
2708     intptr_t i, opr_sz = simd_oprsz(desc);
2709     int8_t *d = vd, *n = vn, *m = vm;
2710 
2711     for (i = 0; i < opr_sz; ++i) {
2712         d[i] = ((int32_t)n[i] * m[i]) >> 8;
2713     }
2714     clear_tail(d, opr_sz, simd_maxsz(desc));
2715 }
2716 
2717 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2718 {
2719     intptr_t i, opr_sz = simd_oprsz(desc);
2720     int16_t *d = vd, *n = vn, *m = vm;
2721 
2722     for (i = 0; i < opr_sz / 2; ++i) {
2723         d[i] = ((int32_t)n[i] * m[i]) >> 16;
2724     }
2725     clear_tail(d, opr_sz, simd_maxsz(desc));
2726 }
2727 
2728 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2729 {
2730     intptr_t i, opr_sz = simd_oprsz(desc);
2731     int32_t *d = vd, *n = vn, *m = vm;
2732 
2733     for (i = 0; i < opr_sz / 4; ++i) {
2734         d[i] = ((int64_t)n[i] * m[i]) >> 32;
2735     }
2736     clear_tail(d, opr_sz, simd_maxsz(desc));
2737 }
2738 
2739 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2740 {
2741     intptr_t i, opr_sz = simd_oprsz(desc);
2742     uint64_t *d = vd, *n = vn, *m = vm;
2743     uint64_t discard;
2744 
2745     for (i = 0; i < opr_sz / 8; ++i) {
2746         muls64(&discard, &d[i], n[i], m[i]);
2747     }
2748     clear_tail(d, opr_sz, simd_maxsz(desc));
2749 }
2750 
2751 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2752 {
2753     intptr_t i, opr_sz = simd_oprsz(desc);
2754     uint8_t *d = vd, *n = vn, *m = vm;
2755 
2756     for (i = 0; i < opr_sz; ++i) {
2757         d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2758     }
2759     clear_tail(d, opr_sz, simd_maxsz(desc));
2760 }
2761 
2762 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2763 {
2764     intptr_t i, opr_sz = simd_oprsz(desc);
2765     uint16_t *d = vd, *n = vn, *m = vm;
2766 
2767     for (i = 0; i < opr_sz / 2; ++i) {
2768         d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2769     }
2770     clear_tail(d, opr_sz, simd_maxsz(desc));
2771 }
2772 
2773 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2774 {
2775     intptr_t i, opr_sz = simd_oprsz(desc);
2776     uint32_t *d = vd, *n = vn, *m = vm;
2777 
2778     for (i = 0; i < opr_sz / 4; ++i) {
2779         d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2780     }
2781     clear_tail(d, opr_sz, simd_maxsz(desc));
2782 }
2783 
2784 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2785 {
2786     intptr_t i, opr_sz = simd_oprsz(desc);
2787     uint64_t *d = vd, *n = vn, *m = vm;
2788     uint64_t discard;
2789 
2790     for (i = 0; i < opr_sz / 8; ++i) {
2791         mulu64(&discard, &d[i], n[i], m[i]);
2792     }
2793     clear_tail(d, opr_sz, simd_maxsz(desc));
2794 }
2795 
2796 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2797 {
2798     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2799     int shr = simd_data(desc);
2800     uint64_t *d = vd, *n = vn, *m = vm;
2801 
2802     for (i = 0; i < opr_sz; ++i) {
2803         d[i] = ror64(n[i] ^ m[i], shr);
2804     }
2805     clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2806 }
2807 
2808 /*
2809  * Integer matrix-multiply accumulate
2810  */
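/*
 * Editorial sketch of the layout used by do_mmla_b() below: within each
 * 128-bit segment, Vn holds two rows of 8 bytes, Vm holds two columns of
 * 8 bytes, and Vd/Va hold a 2x2 row-major matrix of int32, with
 *     d[2*i + j] = a[2*i + j] + dot(Vn bytes [8*i..8*i+7],
 *                                   Vm bytes [8*j..8*j+7])
 * computed by the four inner_loop calls, one per (i, j).
 */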
2811 
2812 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2813 {
2814     int8_t *n = vn, *m = vm;
2815 
2816     for (intptr_t k = 0; k < 8; ++k) {
2817         sum += n[H1(k)] * m[H1(k)];
2818     }
2819     return sum;
2820 }
2821 
2822 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2823 {
2824     uint8_t *n = vn, *m = vm;
2825 
2826     for (intptr_t k = 0; k < 8; ++k) {
2827         sum += n[H1(k)] * m[H1(k)];
2828     }
2829     return sum;
2830 }
2831 
2832 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2833 {
2834     uint8_t *n = vn;
2835     int8_t *m = vm;
2836 
2837     for (intptr_t k = 0; k < 8; ++k) {
2838         sum += n[H1(k)] * m[H1(k)];
2839     }
2840     return sum;
2841 }
2842 
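/*
 * Each 16-byte segment of N and M holds a 2x8 matrix of bytes;
 * result element d[2*i + j] accumulates the dot product of row i
 * of N with row j of M (i.e. N * M^T) on top of a[2*i + j].
 */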
2843 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2844                       uint32_t (*inner_loop)(uint32_t, void *, void *))
2845 {
2846     intptr_t seg, opr_sz = simd_oprsz(desc);
2847 
2848     for (seg = 0; seg < opr_sz; seg += 16) {
2849         uint32_t *d = vd + seg;
2850         uint32_t *a = va + seg;
2851         uint32_t sum0, sum1, sum2, sum3;
2852 
2853         /*
2854          * Process the entire segment at once, writing back the
2855          * results only after we've consumed all of the inputs.
2856          *
2857          * Key to indices by column:
2858          *          i   j                  i             j
2859          */
2860         sum0 = a[H4(0 + 0)];
2861         sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2862         sum1 = a[H4(0 + 1)];
2863         sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2864         sum2 = a[H4(2 + 0)];
2865         sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2866         sum3 = a[H4(2 + 1)];
2867         sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2868 
2869         d[H4(0)] = sum0;
2870         d[H4(1)] = sum1;
2871         d[H4(2)] = sum2;
2872         d[H4(3)] = sum3;
2873     }
2874     clear_tail(vd, opr_sz, simd_maxsz(desc));
2875 }
2876 
2877 #define DO_MMLA_B(NAME, INNER) \
2878     void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2879     { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2880 
2881 DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2882 DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2883 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2884 
2885 /*
2886  * BFloat16 Dot Product
2887  */
2888 
2889 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
2890 {
2891     /*
2892      * For BFDOT, BFMMLA, etc., the behaviour depends on FPCR.EBF.
2893      * For EBF = 0, we ignore the FPCR bits which determine rounding
2894      * mode and denormal-flushing, and we do unfused multiplies and
2895      * additions with intermediate rounding of all products and sums.
2896      * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
2897      * and we perform a fused two-way sum-of-products without intermediate
2898      * rounding of the products.
2899      * In either case, we don't set fp exception flags.
2900      *
2901      * EBF is AArch64 only, so even if it's set in the FPCR it has
2902      * no effect on AArch32 instructions.
2903      */
2904     bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;
2905 
2906     *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32;
2907     set_default_nan_mode(true, statusp);
2908 
2909     if (ebf) {
2910         /* EBF=1 needs to do a step with round-to-odd semantics */
2911         *oddstatusp = *statusp;
2912         set_float_rounding_mode(float_round_to_odd, oddstatusp);
2913     } else {
2914         set_flush_to_zero(true, statusp);
2915         set_flush_inputs_to_zero(true, statusp);
2916         set_float_rounding_mode(float_round_to_odd_inf, statusp);
2917     }
2918     return ebf;
2919 }
2920 
2921 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2922 {
2923     float32 t1, t2;
2924 
2925     /*
2926      * Extract each BFloat16 from the element pair and position it in the
2927      * high half of a float32 (shift for the low element, mask for the high).
2928      */
2929     t1 = float32_mul(e1 << 16, e2 << 16, fpst);
2930     t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
2931     t1 = float32_add(t1, t2, fpst);
2932     t1 = float32_add(sum, t1, fpst);
2933 
2934     return t1;
2935 }
2936 
2937 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
2938                      float_status *fpst, float_status *fpst_odd)
2939 {
2940     /*
2941      * Compare f16_dotadd() in sme_helper.c, but here we have
2942      * bfloat16 inputs. In particular that means that we do not
2943      * want the FPCR.FZ16 flush semantics, so we use the normal
2944      * float_status for the input handling here.
2945      */
2946     float64 e1r = float32_to_float64(e1 << 16, fpst);
2947     float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
2948     float64 e2r = float32_to_float64(e2 << 16, fpst);
2949     float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
2950     float64 t64;
2951     float32 t32;
2952 
2953     /*
2954      * The ARM pseudocode function FPDot performs both multiplies
2955      * and the add with a single rounding operation.  Emulate this
2956      * by performing the first multiply in round-to-odd, then doing
2957      * the second multiply as fused multiply-add, and rounding to
2958      * float32 all in one step.
2959      */
2960     t64 = float64_mul(e1r, e2r, fpst_odd);
2961     t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);
2962 
2963     /* This conversion is exact, because we've already rounded. */
2964     t32 = float64_to_float32(t64, fpst);
2965 
2966     /* The final accumulation step is not fused. */
2967     return float32_add(sum, t32, fpst);
2968 }
2969 
2970 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
2971                         CPUARMState *env, uint32_t desc)
2972 {
2973     intptr_t i, opr_sz = simd_oprsz(desc);
2974     float32 *d = vd, *a = va;
2975     uint32_t *n = vn, *m = vm;
2976     float_status fpst, fpst_odd;
2977 
2978     if (is_ebf(env, &fpst, &fpst_odd)) {
2979         for (i = 0; i < opr_sz / 4; ++i) {
2980             d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
2981         }
2982     } else {
2983         for (i = 0; i < opr_sz / 4; ++i) {
2984             d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
2985         }
2986     }
2987     clear_tail(d, opr_sz, simd_maxsz(desc));
2988 }
2989 
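/*
 * BFDOT (indexed): the bfloat16 pair is taken from the indexed 32-bit
 * element within each 128-bit segment of M, and is reused for every
 * destination element in that segment.
 */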
2990 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2991                             void *va, CPUARMState *env, uint32_t desc)
2992 {
2993     intptr_t i, j, opr_sz = simd_oprsz(desc);
2994     intptr_t index = simd_data(desc);
2995     intptr_t elements = opr_sz / 4;
2996     intptr_t eltspersegment = MIN(16 / 4, elements);
2997     float32 *d = vd, *a = va;
2998     uint32_t *n = vn, *m = vm;
2999     float_status fpst, fpst_odd;
3000 
3001     if (is_ebf(env, &fpst, &fpst_odd)) {
3002         for (i = 0; i < elements; i += eltspersegment) {
3003             uint32_t m_idx = m[i + H4(index)];
3004 
3005             for (j = i; j < i + eltspersegment; j++) {
3006                 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
3007             }
3008         }
3009     } else {
3010         for (i = 0; i < elements; i += eltspersegment) {
3011             uint32_t m_idx = m[i + H4(index)];
3012 
3013             for (j = i; j < i + eltspersegment; j++) {
3014                 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
3015             }
3016         }
3017     }
3018     clear_tail(d, opr_sz, simd_maxsz(desc));
3019 }
3020 
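/*
 * BFMMLA: each 16-byte segment of N and M holds a 2x4 matrix of
 * bfloat16 elements (two pairs per row); d[2*i + j] accumulates the
 * four-way dot product of row i of N with row j of M.
 */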
3021 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
3022                          CPUARMState *env, uint32_t desc)
3023 {
3024     intptr_t s, opr_sz = simd_oprsz(desc);
3025     float32 *d = vd, *a = va;
3026     uint32_t *n = vn, *m = vm;
3027     float_status fpst, fpst_odd;
3028 
3029     if (is_ebf(env, &fpst, &fpst_odd)) {
3030         for (s = 0; s < opr_sz / 4; s += 4) {
3031             float32 sum00, sum01, sum10, sum11;
3032 
3033             /*
3034              * Process the entire segment at once, writing back the
3035              * results only after we've consumed all of the inputs.
3036              *
3037              * Key to indices by column:
3038              *               i   j               i   k             j   k
3039              */
3040             sum00 = a[s + H4(0 + 0)];
3041             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3042             sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3043 
3044             sum01 = a[s + H4(0 + 1)];
3045             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3046             sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3047 
3048             sum10 = a[s + H4(2 + 0)];
3049             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
3050             sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);
3051 
3052             sum11 = a[s + H4(2 + 1)];
3053             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
3054             sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);
3055 
3056             d[s + H4(0 + 0)] = sum00;
3057             d[s + H4(0 + 1)] = sum01;
3058             d[s + H4(2 + 0)] = sum10;
3059             d[s + H4(2 + 1)] = sum11;
3060         }
3061     } else {
3062         for (s = 0; s < opr_sz / 4; s += 4) {
3063             float32 sum00, sum01, sum10, sum11;
3064 
3065             /*
3066              * Process the entire segment at once, writing back the
3067              * results only after we've consumed all of the inputs.
3068              *
3069              * Key to indices by column:
3070              *               i   j           i   k             j   k
3071              */
3072             sum00 = a[s + H4(0 + 0)];
3073             sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
3074             sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);
3075 
3076             sum01 = a[s + H4(0 + 1)];
3077             sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
3078             sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);
3079 
3080             sum10 = a[s + H4(2 + 0)];
3081             sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
3082             sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);
3083 
3084             sum11 = a[s + H4(2 + 1)];
3085             sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
3086             sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);
3087 
3088             d[s + H4(0 + 0)] = sum00;
3089             d[s + H4(0 + 1)] = sum01;
3090             d[s + H4(2 + 0)] = sum10;
3091             d[s + H4(2 + 1)] = sum11;
3092         }
3093     }
3094     clear_tail(d, opr_sz, simd_maxsz(desc));
3095 }
3096 
3097 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
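/*
 * BFMLAL{B,T}: widen the even (sel = 0) or odd (sel = 1) bfloat16
 * element of each pair to float32 by shifting it into the high half,
 * then fused multiply-add into the float32 accumulator.
 */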
3098                          float_status *stat, uint32_t desc)
3099 {
3100     intptr_t i, opr_sz = simd_oprsz(desc);
3101     intptr_t sel = simd_data(desc);
3102     float32 *d = vd, *a = va;
3103     bfloat16 *n = vn, *m = vm;
3104 
3105     for (i = 0; i < opr_sz / 4; ++i) {
3106         float32 nn = n[H2(i * 2 + sel)] << 16;
3107         float32 mm = m[H2(i * 2 + sel)] << 16;
3108         d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
3109     }
3110     clear_tail(d, opr_sz, simd_maxsz(desc));
3111 }
3112 
3113 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
3114                              void *va, float_status *stat, uint32_t desc)
3115 {
3116     intptr_t i, j, opr_sz = simd_oprsz(desc);
3117     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
3118     intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
3119     intptr_t elements = opr_sz / 4;
3120     intptr_t eltspersegment = MIN(16 / 4, elements);
3121     float32 *d = vd, *a = va;
3122     bfloat16 *n = vn, *m = vm;
3123 
3124     for (i = 0; i < elements; i += eltspersegment) {
3125         float32 m_idx = m[H2(2 * i + index)] << 16;
3126 
3127         for (j = i; j < i + eltspersegment; j++) {
3128             float32 n_j = n[H2(2 * j + sel)] << 16;
3129             d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
3130         }
3131     }
3132     clear_tail(d, opr_sz, simd_maxsz(desc));
3133 }
3134 
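/*
 * Clamp each element of the accumulator A to the inclusive range
 * [N, M]: d = MIN(MAX(a, n), m).  For example, with a = 7, n = 0,
 * m = 5 the result is 5.
 */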
3135 #define DO_CLAMP(NAME, TYPE) \
3136 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
3137 {                                                                       \
3138     intptr_t i, opr_sz = simd_oprsz(desc);                              \
3139     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
3140         TYPE aa = *(TYPE *)(a + i);                                     \
3141         TYPE nn = *(TYPE *)(n + i);                                     \
3142         TYPE mm = *(TYPE *)(m + i);                                     \
3143         TYPE dd = MIN(MAX(aa, nn), mm);                                 \
3144         *(TYPE *)(d + i) = dd;                                          \
3145     }                                                                   \
3146     clear_tail(d, opr_sz, simd_maxsz(desc));                            \
3147 }
3148 
3149 DO_CLAMP(gvec_sclamp_b, int8_t)
3150 DO_CLAMP(gvec_sclamp_h, int16_t)
3151 DO_CLAMP(gvec_sclamp_s, int32_t)
3152 DO_CLAMP(gvec_sclamp_d, int64_t)
3153 
3154 DO_CLAMP(gvec_uclamp_b, uint8_t)
3155 DO_CLAMP(gvec_uclamp_h, uint16_t)
3156 DO_CLAMP(gvec_uclamp_s, uint32_t)
3157 DO_CLAMP(gvec_uclamp_d, uint64_t)
3158 
3159 /* Bit count in each 8-bit word. */
3160 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
3161 {
3162     intptr_t i, opr_sz = simd_oprsz(desc);
3163     uint8_t *d = vd, *n = vn;
3164 
3165     for (i = 0; i < opr_sz; ++i) {
3166         d[i] = ctpop8(n[i]);
3167     }
3168     clear_tail(d, opr_sz, simd_maxsz(desc));
3169 }
3170 
3171 /* Reverse bits in each 8-bit word. */
3172 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
3173 {
3174     intptr_t i, opr_sz = simd_oprsz(desc);
3175     uint64_t *d = vd, *n = vn;
3176 
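    /*
     * Reversing all 64 bits and then swapping the bytes restores the
     * byte order but leaves the bits within each byte reversed, so this
     * handles eight byte elements per iteration (e.g. 0x01 -> 0x80).
     */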
3177     for (i = 0; i < opr_sz / 8; ++i) {
3178         d[i] = revbit64(bswap64(n[i]));
3179     }
3180     clear_tail(d, opr_sz, simd_maxsz(desc));
3181 }
3182 
3183 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
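/* Unsigned reciprocal estimate (URECPE), per 32-bit element. */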
3184 {
3185     intptr_t i, opr_sz = simd_oprsz(desc);
3186     uint32_t *d = vd, *n = vn;
3187 
3188     for (i = 0; i < opr_sz / 4; ++i) {
3189         d[i] = helper_recpe_u32(n[i]);
3190     }
3191     clear_tail(d, opr_sz, simd_maxsz(desc));
3192 }
3193 
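/* Unsigned reciprocal square root estimate (URSQRTE), per 32-bit element. */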
3194 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
3195 {
3196     intptr_t i, opr_sz = simd_oprsz(desc);
3197     uint32_t *d = vd, *n = vn;
3198 
3199     for (i = 0; i < opr_sz / 4; ++i) {
3200         d[i] = helper_rsqrte_u32(n[i]);
3201     }
3202     clear_tail(d, opr_sz, simd_maxsz(desc));
3203 }
3204