/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};

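/*
 * Predicate bits for 2-byte elements can be active only in every
 * other bit position, which is why the generator above skips any i
 * with a bit set in 0xaa and the table needs entries only up to 0x55.
 */
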
/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}

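/*
 * Illustrative arithmetic for the helper above (not exercised by the
 * code): do_sqrdmlah_b(0x40, 0x40, 0, false, true) computes
 * 64 * 64 + (1 << 6) = 4160, then 4160 >> 7 = 32.  The saturating
 * case do_sqrdmlah_b(INT8_MIN, INT8_MIN, 0, false, true) computes
 * 16384 + 64 = 16448, then 16448 >> 7 = 128, which does not fit in
 * int8_t and so clamps to INT8_MAX.
 */
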
void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

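/*
 * Note that *sat is only ever set, never cleared, so saturation
 * accumulates across elements.  The Neon helpers below point it at
 * env->vfp.qc[0] so that saturation lands in FPSCR.QC; the SVE2
 * helpers pass a dummy, since SVE2 does not report saturation.
 */
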
uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

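/*
 * In the *_idx_* helpers below the multiplier element is selected per
 * 128-bit (16-byte) segment: within each segment, element idx of m is
 * broadcast across the whole segment.  E.g. for 16-bit elements and a
 * 32-byte vector, m[idx] multiplies d[0..7] and m[8 + idx] multiplies
 * d[8..15].
 */
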
void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

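/*
 * Example of why the 128-bit intermediate is required: for
 * n = m = INT64_MIN the product is 2**126, which overflows int64_t;
 * after rounding and the shift by 63 the result 2**63 still exceeds
 * INT64_MAX, so do_sat128_d() clamps it to INT64_MAX.
 */
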
void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

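/*
 * Illustrative arithmetic for the 4-way case: with int8_t inputs
 * n = {1, -2, 3, -4} and m = {5, 6, 7, 8} in one lane, gvec_sdot_4b
 * computes d[0] = a[0] + (1 * 5) + (-2 * 6) + (3 * 7) + (-4 * 8)
 *               = a[0] - 18.
 */
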
#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_4b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_4b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_4b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_4h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_4h, uint64_t, uint16_t, uint16_t)

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    /*                                                                    \
     * Special case: opr_sz == 8 from AA64/AA32 advsimd means the         \
     * first iteration might not be a full 16 byte segment. But           \
     * for vector lengths beyond that this must be SVE and we know        \
     * opr_sz is a multiple of 16, so we need not clamp segend            \
     * to opr_sz_n when we advance it at the end of the loop.             \
     */                                                                   \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        segend = i + (16 / sizeof(TYPED));                                \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT_IDX(gvec_sdot_idx_4b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_4b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_4b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_4b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_4h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_4h, uint64_t, uint16_t, uint16_t, H8)

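/*
 * For the indexed forms, "index" selects one 4-element group of m
 * within each 16-byte segment; that group is then dotted against
 * every 4-element group of n in the same segment.
 */
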
#undef DO_DOT
#undef DO_DOT_IDX

/* Similar for 2-way dot product */
#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 2 + 0] * m[i * 2 + 0] +                      \
                (TYPED)n[i * 2 + 1] * m[i * 2 + 1]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 2;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 2 + 0];                                  \
        TYPED m1 = m_indexed[i * 2 + 1];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 2 + 0] * m0 +                                   \
                    n[i * 2 + 1] * m1);                                   \
        } while (++i < segend);                                           \
        segend = i + (16 / sizeof(TYPED));                                \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_2h, int32_t, int16_t, int16_t)
DO_DOT(gvec_udot_2h, uint32_t, uint16_t, uint16_t)

DO_DOT_IDX(gvec_sdot_idx_2h, int32_t, int16_t, int16_t, H4)
DO_DOT_IDX(gvec_udot_idx_2h, uint32_t, uint16_t, uint16_t, H4)

#undef DO_DOT
#undef DO_DOT_IDX

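/*
 * FCADD: complex add with the second operand rotated by 90 or 270
 * degrees.  Viewing (d[2k], d[2k+1]) as (real, imag) pairs, each
 * iteration below computes:
 *   rot = 0 (FCADD #90):   d.re = n.re - m.im;  d.im = n.im + m.re
 *   rot = 1 (FCADD #270):  d.re = n.re + m.im;  d.im = n.im - m.re
 * with the negation routed through float*_maybe_ah_chs so that,
 * per FPCR.AH semantics, the sign of a NaN input is left alone
 * when AH is set.
 */
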
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)];
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)];

        if (rot) {
            e3 = float16_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float16_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)];
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)];

        if (rot) {
            e3 = float32_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float32_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1];
        float64 e2 = n[i + 1];
        float64 e3 = m[i];

        if (rot) {
            e3 = float64_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float64_maybe_ah_chs(e1, fpcr_ah);
        }

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

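/*
 * FCMLA: complex multiply-accumulate.  desc bit 0 (flip) selects
 * whether the real or imaginary element of n feeds both products,
 * and negx/negf apply the rotation-dependent negation.  E.g. with
 * flip = 0 and no negation each iteration computes
 *   d.re = fma(n.re, m.re, a.re);  d.im = fma(n.re, m.im, a.im)
 * i.e. half of a complex multiply; the other rotations supply the
 * remaining operand/sign combinations.  Per the comments below,
 * with FPCR.AH == 0 the negation is applied by flipping the sign
 * bit of m (negx), while with AH == 1 it is folded into the fused
 * multiply-add via float_muladd_negate_product (negf).
 */
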
void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float16 negx_imag, negx_real;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 15;
    negx_imag = (negf_imag & ~fpcr_ah) << 15;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ negx_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], negf_real, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
    uint32_t negf_real = flip ^ negf_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    float16 negx_imag, negx_real;
    intptr_t i, j;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 15;
    negx_imag = (negf_imag & ~fpcr_ah) << 15;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = negx_real ^ (flip ? mi : mr);
        float16 e3 = negx_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], negf_real, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], negf_imag, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float32 negx_imag, negx_real;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 31;
    negx_imag = (negf_imag & ~fpcr_ah) << 31;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ negx_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], negf_real, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
    uint32_t negf_real = flip ^ negf_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    float32 negx_imag, negx_real;
    intptr_t i, j;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 31;
    negx_imag = (negf_imag & ~fpcr_ah) << 31;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = negx_real ^ (flip ? mi : mr);
        float32 e3 = negx_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], negf_real, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], negf_imag, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float64 negx_real, negx_imag;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
    negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ negx_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ negx_imag;

        d[i] = float64_muladd(e2, e1, a[i], negf_real, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1183  */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_rpres_s, helper_recpe_rpres_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_rpres_s, helper_rsqrte_rpres_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat);           \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE)                          \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)     \
    {                                                           \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat);           \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN)                    \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                \
    WRAP_CMP0_##DIRN(FN, CMPOP, float64)                \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)   \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)   \
    DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

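/*
 * The REV forms implement the compare against zero by swapping
 * operands, e.g. float32_clt0(x) is float32_cgt(0, x), which via the
 * wrappers above evaluates to -float32_lt(x, 0).
 */
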
#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
{
    float16 r = float16_sub(op1, op2, stat);
    return float16_is_any_nan(r) ? r : float16_abs(r);
}

static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
{
    float32 r = float32_sub(op1, op2, stat);
    return float32_is_any_nan(r) ? r : float32_abs(r);
}

static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
{
    float64 r = float64_sub(op1, op2, stat);
    return float64_is_any_nan(r) ? r : float64_abs(r);
}

/*
 * Reciprocal step. These are the AArch32 versions, which use a
 * non-fused multiply-and-subtract.
 */
1407 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1408 {
1409     op1 = float16_squash_input_denormal(op1, stat);
1410     op2 = float16_squash_input_denormal(op2, stat);
1411 
1412     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1413         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1414         return float16_two;
1415     }
1416     return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1417 }
1418 
1419 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1420 {
1421     op1 = float32_squash_input_denormal(op1, stat);
1422     op2 = float32_squash_input_denormal(op2, stat);
1423 
1424     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1425         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1426         return float32_two;
1427     }
1428     return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1429 }
1430 
1431 /* Reciprocal square-root step. AArch32 non-fused semantics. */
1432 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1433 {
1434     op1 = float16_squash_input_denormal(op1, stat);
1435     op2 = float16_squash_input_denormal(op2, stat);
1436 
1437     if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1438         (float16_is_infinity(op2) && float16_is_zero(op1))) {
1439         return float16_one_point_five;
1440     }
1441     op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1442     return float16_div(op1, float16_two, stat);
1443 }
1444 
1445 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1446 {
1447     op1 = float32_squash_input_denormal(op1, stat);
1448     op2 = float32_squash_input_denormal(op2, stat);
1449 
1450     if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1451         (float32_is_infinity(op2) && float32_is_zero(op1))) {
1452         return float32_one_point_five;
1453     }
1454     op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1455     return float32_div(op1, float32_two, stat);
1456 }
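/*
 * Usage note (illustrative only): these are Newton-Raphson step
 * functions.  Given an estimate r of 1/x, a refined estimate is
 *     r' = r * recps(x, r)         since recps(x, r) = 2 - x*r
 * and given an estimate r of 1/sqrt(x),
 *     r' = r * rsqrts(x * r, r)    since rsqrts(a, b) = (3 - a*b) / 2.
 * The inf/zero special cases above return the exact constant instead
 * of a NaN so that the iteration still converges for 1/0 and 1/inf.
 * A minimal sketch of one refinement step (hypothetical helper):
 */
static inline float32 example_recip_refine(float32 x, float32 r,
                                           float_status *stat)
{
    return float32_mul(r, float32_recps_nf(x, r, stat), stat);
}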
1457 
1458 #define DO_3OP(NAME, FUNC, TYPE) \
1459 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1460                   float_status *stat, uint32_t desc)                       \
1461 {                                                                          \
1462     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1463     TYPE *d = vd, *n = vn, *m = vm;                                        \
1464     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1465         d[i] = FUNC(n[i], m[i], stat);                                     \
1466     }                                                                      \
1467     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1468 }
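/*
 * For reference, DO_3OP(gvec_fadd_h, float16_add, float16) below
 * expands to (reformatted):
 *
 *   void helper_gvec_fadd_h(void *vd, void *vn, void *vm,
 *                           float_status *stat, uint32_t desc)
 *   {
 *       intptr_t i, oprsz = simd_oprsz(desc);
 *       float16 *d = vd, *n = vn, *m = vm;
 *       for (i = 0; i < oprsz / sizeof(float16); i++) {
 *           d[i] = float16_add(n[i], m[i], stat);
 *       }
 *       clear_tail(d, oprsz, simd_maxsz(desc));
 *   }
 */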
1469 
1470 DO_3OP(gvec_fadd_h, float16_add, float16)
1471 DO_3OP(gvec_fadd_s, float32_add, float32)
1472 DO_3OP(gvec_fadd_d, float64_add, float64)
1473 DO_3OP(gvec_bfadd, bfloat16_add, bfloat16)
1474 
1475 DO_3OP(gvec_fsub_h, float16_sub, float16)
1476 DO_3OP(gvec_fsub_s, float32_sub, float32)
1477 DO_3OP(gvec_fsub_d, float64_sub, float64)
1478 DO_3OP(gvec_bfsub, bfloat16_sub, bfloat16)
1479 
1480 DO_3OP(gvec_fmul_h, float16_mul, float16)
1481 DO_3OP(gvec_fmul_s, float32_mul, float32)
1482 DO_3OP(gvec_fmul_d, float64_mul, float64)
1483 
1484 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1485 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1486 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1487 
1488 DO_3OP(gvec_fabd_h, float16_abd, float16)
1489 DO_3OP(gvec_fabd_s, float32_abd, float32)
1490 DO_3OP(gvec_fabd_d, float64_abd, float64)
1491 
1492 DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
1493 DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
1494 DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)
1495 
1496 DO_3OP(gvec_fceq_h, float16_ceq, float16)
1497 DO_3OP(gvec_fceq_s, float32_ceq, float32)
1498 DO_3OP(gvec_fceq_d, float64_ceq, float64)
1499 
1500 DO_3OP(gvec_fcge_h, float16_cge, float16)
1501 DO_3OP(gvec_fcge_s, float32_cge, float32)
1502 DO_3OP(gvec_fcge_d, float64_cge, float64)
1503 
1504 DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1505 DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1506 DO_3OP(gvec_fcgt_d, float64_cgt, float64)
1507 
1508 DO_3OP(gvec_facge_h, float16_acge, float16)
1509 DO_3OP(gvec_facge_s, float32_acge, float32)
1510 DO_3OP(gvec_facge_d, float64_acge, float64)
1511 
1512 DO_3OP(gvec_facgt_h, float16_acgt, float16)
1513 DO_3OP(gvec_facgt_s, float32_acgt, float32)
1514 DO_3OP(gvec_facgt_d, float64_acgt, float64)
1515 
1516 DO_3OP(gvec_fmax_h, float16_max, float16)
1517 DO_3OP(gvec_fmax_s, float32_max, float32)
1518 DO_3OP(gvec_fmax_d, float64_max, float64)
1519 
1520 DO_3OP(gvec_fmin_h, float16_min, float16)
1521 DO_3OP(gvec_fmin_s, float32_min, float32)
1522 DO_3OP(gvec_fmin_d, float64_min, float64)
1523 
1524 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1525 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1526 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)
1527 
1528 DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1529 DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1530 DO_3OP(gvec_fminnum_d, float64_minnum, float64)
1531 
1532 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1533 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1534 
1535 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1536 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1537 
1538 #ifdef TARGET_AARCH64
1539 DO_3OP(gvec_fdiv_h, float16_div, float16)
1540 DO_3OP(gvec_fdiv_s, float32_div, float32)
1541 DO_3OP(gvec_fdiv_d, float64_div, float64)
1542 
1543 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
1544 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
1545 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)
1546 
1547 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1548 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1549 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1550 
1551 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1552 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1553 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1554 
1555 DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
1556 DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
1557 DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)
1558 
1559 DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
1560 DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
1561 DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)
1562 
1563 DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
1564 DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
1565 DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)
1566 
1567 DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
1568 DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
1569 DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)
1570 
1571 DO_3OP(gvec_fmax_b16, bfloat16_max, bfloat16)
1572 DO_3OP(gvec_fmin_b16, bfloat16_min, bfloat16)
1573 DO_3OP(gvec_fmaxnum_b16, bfloat16_maxnum, bfloat16)
1574 DO_3OP(gvec_fminnum_b16, bfloat16_minnum, bfloat16)
1575 DO_3OP(gvec_ah_fmax_b16, helper_sme2_ah_fmax_b16, bfloat16)
1576 DO_3OP(gvec_ah_fmin_b16, helper_sme2_ah_fmin_b16, bfloat16)
1577 
1578 #endif
1579 #undef DO_3OP
1580 
1581 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1582 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1583                                  float_status *stat)
1584 {
1585     return float16_add(dest, float16_mul(op1, op2, stat), stat);
1586 }
1587 
1588 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1589                                  float_status *stat)
1590 {
1591     return float32_add(dest, float32_mul(op1, op2, stat), stat);
1592 }
1593 
1594 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1595                                  float_status *stat)
1596 {
1597     return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1598 }
1599 
1600 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1601                                  float_status *stat)
1602 {
1603     return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1604 }
1605 
1606 /* Fused versions; these have the semantics Neon VFMA/VFMS want */
1607 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1608                                 float_status *stat)
1609 {
1610     return float16_muladd(op1, op2, dest, 0, stat);
1611 }
1612 
1613 static bfloat16 bfloat16_muladd_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
1614                                   float_status *stat)
1615 {
1616     return bfloat16_muladd(op1, op2, dest, 0, stat);
1617 }
1618 
1619 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1620                                  float_status *stat)
1621 {
1622     return float32_muladd(op1, op2, dest, 0, stat);
1623 }
1624 
1625 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
1626                                  float_status *stat)
1627 {
1628     return float64_muladd(op1, op2, dest, 0, stat);
1629 }
1630 
1631 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1632                                  float_status *stat)
1633 {
1634     return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1635 }
1636 
1637 static bfloat16 bfloat16_mulsub_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
1638                                   float_status *stat)
1639 {
1640     return bfloat16_muladd(bfloat16_chs(op1), op2, dest, 0, stat);
1641 }
1642 
1643 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1644                                  float_status *stat)
1645 {
1646     return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1647 }
1648 
1649 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
1650                                  float_status *stat)
1651 {
1652     return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
1653 }
1654 
1655 static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2,
1656                                  float_status *stat)
1657 {
1658     return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1659 }
1660 
1661 static bfloat16 bfloat16_ah_mulsub_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
1662                                      float_status *stat)
1663 {
1664     return bfloat16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1665 }
1666 
1667 static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2,
1668                                  float_status *stat)
1669 {
1670     return float32_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1671 }
1672 
1673 static float64 float64_ah_mulsub_f(float64 dest, float64 op1, float64 op2,
1674                                  float_status *stat)
1675 {
1676     return float64_muladd(op1, op2, dest, float_muladd_negate_product, stat);
1677 }
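/*
 * Note the rounding difference: the _nf versions above round twice
 * (after the multiply and again after the add), matching Neon
 * VMLA/VMLS, while the fused versions round once, as VFMA/VFMS
 * require; the two can differ in the last ulp.  The _ah_ variants
 * request product negation from softfloat rather than flipping the
 * sign of op1, so a NaN in op1 passes through with its sign intact.
 */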
1678 
1679 #define DO_MULADD(NAME, FUNC, TYPE)                                        \
1680 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1681                   float_status *stat, uint32_t desc)                       \
1682 {                                                                          \
1683     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1684     TYPE *d = vd, *n = vn, *m = vm;                                        \
1685     for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
1686         d[i] = FUNC(d[i], n[i], m[i], stat);                               \
1687     }                                                                      \
1688     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1689 }
1690 
1691 DO_MULADD(gvec_fmla_nf_h, float16_muladd_nf, float16)
1692 DO_MULADD(gvec_fmla_nf_s, float32_muladd_nf, float32)
1693 
1694 DO_MULADD(gvec_fmls_nf_h, float16_mulsub_nf, float16)
1695 DO_MULADD(gvec_fmls_nf_s, float32_mulsub_nf, float32)
1696 
1697 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1698 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1699 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)
1700 DO_MULADD(gvec_bfmla, bfloat16_muladd_f, bfloat16)
1701 
1702 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1703 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1704 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)
1705 DO_MULADD(gvec_bfmls, bfloat16_mulsub_f, bfloat16)
1706 
1707 DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16)
1708 DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32)
1709 DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64)
1710 DO_MULADD(gvec_ah_bfmls, bfloat16_ah_mulsub_f, bfloat16)
1711 
1712 #undef DO_MULADD
1713 
1714 /* For the indexed ops, SVE applies the index per 128-bit vector segment.
1715  * For AdvSIMD, there is of course only one such vector segment.
1716  */
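/*
 * Example: for 32-bit elements with oprsz == 32 (a 256-bit SVE
 * vector), segment == 4 and the same index is applied within each
 * 128-bit segment, so with idx == 1 gvec_mul_idx_s computes
 *     d[0..3] = n[0..3] * m[1];    d[4..7] = n[4..7] * m[5];
 */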
1717 
1718 #define DO_MUL_IDX(NAME, TYPE, H) \
1719 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1720 {                                                                          \
1721     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1722     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1723     intptr_t idx = simd_data(desc);                                        \
1724     TYPE *d = vd, *n = vn, *m = vm;                                        \
1725     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1726         TYPE mm = m[H(i + idx)];                                           \
1727         for (j = 0; j < segment; j++) {                                    \
1728             d[i + j] = n[i + j] * mm;                                      \
1729         }                                                                  \
1730     }                                                                      \
1731     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1732 }
1733 
1734 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1735 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1736 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1737 
1738 #undef DO_MUL_IDX
1739 
1740 #define DO_MLA_IDX(NAME, TYPE, OP, H) \
1741 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
1742 {                                                                          \
1743     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1744     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1745     intptr_t idx = simd_data(desc);                                        \
1746     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1747     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1748         TYPE mm = m[H(i + idx)];                                           \
1749         for (j = 0; j < segment; j++) {                                    \
1750             d[i + j] = a[i + j] OP n[i + j] * mm;                          \
1751         }                                                                  \
1752     }                                                                      \
1753     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1754 }
1755 
1756 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1757 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1758 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1759 
1760 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1761 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1762 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1763 
1764 #undef DO_MLA_IDX
1765 
1766 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                               \
1767 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
1768                   float_status *stat, uint32_t desc)                       \
1769 {                                                                          \
1770     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1771     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1772     intptr_t idx = simd_data(desc);                                        \
1773     TYPE *d = vd, *n = vn, *m = vm;                                        \
1774     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1775         TYPE mm = m[H(i + idx)];                                           \
1776         for (j = 0; j < segment; j++) {                                    \
1777             d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
1778         }                                                                  \
1779     }                                                                      \
1780     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1781 }
1782 
1783 #define nop(N, M, S) (M)
1784 
1785 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
1786 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
1787 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)
1788 
1789 #ifdef TARGET_AARCH64
1790 
1791 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
1792 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
1793 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)
1794 
1795 #endif
1796 
1797 #undef nop
1798 
1799 /*
1800  * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1801  * the fused ops below, these accumulate both from and into Vd.
1802  */
1803 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
1804 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
1805 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
1806 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)
1807 
1808 #undef DO_FMUL_IDX
1809 
1810 #define DO_FMLA_IDX(NAME, TYPE, H, NEGX, NEGF)                             \
1811 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
1812                   float_status *stat, uint32_t desc)                       \
1813 {                                                                          \
1814     intptr_t i, j, oprsz = simd_oprsz(desc);                               \
1815     intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
1816     intptr_t idx = simd_data(desc);                                        \
1817     TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
1818     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
1819         TYPE mm = m[H(i + idx)];                                           \
1820         for (j = 0; j < segment; j++) {                                    \
1821             d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm,                  \
1822                                      a[i + j], NEGF, stat);                \
1823         }                                                                  \
1824     }                                                                      \
1825     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1826 }
1827 
1828 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0)
1829 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0)
1830 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0)
1831 DO_FMLA_IDX(gvec_bfmla_idx, bfloat16, H2, 0, 0)
1832 
1833 DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0)
1834 DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0)
1835 DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0)
1836 DO_FMLA_IDX(gvec_bfmls_idx, bfloat16, H2, INT16_MIN, 0)
1837 
1838 DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product)
1839 DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product)
1840 DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product)
1841 DO_FMLA_IDX(gvec_ah_bfmls_idx, bfloat16, H2, 0, float_muladd_negate_product)
1842 
1843 #undef DO_FMLA_IDX
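/*
 * Note on the FMLS flavours above: with FPCR.AH == 0 the product is
 * negated by XORing the sign bit into n before the fma (NEGX is the
 * sign-bit constant of the element type, e.g. INT16_MIN == 0x8000
 * for float16), which also flips the sign of a NaN input.  With
 * FPCR.AH == 1 the negation is instead requested from softfloat via
 * float_muladd_negate_product, leaving NaN inputs untouched.
 */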
1844 
1845 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1846 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
1847 {                                                                          \
1848     intptr_t i, oprsz = simd_oprsz(desc);                                  \
1849     TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
1850     bool q = false;                                                        \
1851     for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
1852         WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
1853         if (dd < MIN) {                                                    \
1854             dd = MIN;                                                      \
1855             q = true;                                                      \
1856         } else if (dd > MAX) {                                             \
1857             dd = MAX;                                                      \
1858             q = true;                                                      \
1859         }                                                                  \
1860         d[i] = dd;                                                         \
1861     }                                                                      \
1862     if (q) {                                                               \
1863         uint32_t *qc = vq;                                                 \
1864         qc[0] = 1;                                                         \
1865     }                                                                      \
1866     clear_tail(d, oprsz, simd_maxsz(desc));                                \
1867 }
1868 
1869 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1870 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1871 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1872 
1873 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1874 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1875 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1876 
1877 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1878 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1879 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1880 
1881 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1882 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1883 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1884 
1885 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
1886 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
1887 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)
1888 
1889 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
1890 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
1891 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)
1892 
1893 #undef DO_SAT
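/*
 * Illustrative sketch (hypothetical, not part of the build) of the
 * per-lane arithmetic performed by gvec_uqadd_b: the sum is computed
 * in a wider type, clamped, and any clamping sets the sticky QC flag.
 * E.g. example_uqadd8(200, 100, &q) == 255 with q set, since 300
 * exceeds UINT8_MAX.
 */
static inline uint8_t example_uqadd8(uint8_t n, uint8_t m, bool *q)
{
    int dd = n + m;             /* widened, cannot wrap */
    if (dd > UINT8_MAX) {
        dd = UINT8_MAX;
        *q = true;
    }
    return dd;
}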
1894 
1895 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1896                           void *vm, uint32_t desc)
1897 {
1898     intptr_t i, oprsz = simd_oprsz(desc);
1899     uint64_t *d = vd, *n = vn, *m = vm;
1900     bool q = false;
1901 
1902     for (i = 0; i < oprsz / 8; i++) {
1903         uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1904         if (dd < nn) {
1905             dd = UINT64_MAX;
1906             q = true;
1907         }
1908         d[i] = dd;
1909     }
1910     if (q) {
1911         uint32_t *qc = vq;
1912         qc[0] = 1;
1913     }
1914     clear_tail(d, oprsz, simd_maxsz(desc));
1915 }
1916 
1917 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1918                           void *vm, uint32_t desc)
1919 {
1920     intptr_t i, oprsz = simd_oprsz(desc);
1921     uint64_t *d = vd, *n = vn, *m = vm;
1922     bool q = false;
1923 
1924     for (i = 0; i < oprsz / 8; i++) {
1925         uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1926         if (nn < mm) {
1927             dd = 0;
1928             q = true;
1929         }
1930         d[i] = dd;
1931     }
1932     if (q) {
1933         uint32_t *qc = vq;
1934         qc[0] = 1;
1935     }
1936     clear_tail(d, oprsz, simd_maxsz(desc));
1937 }
1938 
1939 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1940                           void *vm, uint32_t desc)
1941 {
1942     intptr_t i, oprsz = simd_oprsz(desc);
1943     int64_t *d = vd, *n = vn, *m = vm;
1944     bool q = false;
1945 
1946     for (i = 0; i < oprsz / 8; i++) {
1947         int64_t nn = n[i], mm = m[i], dd = nn + mm;
1948         if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1949             dd = (nn >> 63) ^ ~INT64_MIN;
1950             q = true;
1951         }
1952         d[i] = dd;
1953     }
1954     if (q) {
1955         uint32_t *qc = vq;
1956         qc[0] = 1;
1957     }
1958     clear_tail(d, oprsz, simd_maxsz(desc));
1959 }
1960 
1961 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1962                           void *vm, uint32_t desc)
1963 {
1964     intptr_t i, oprsz = simd_oprsz(desc);
1965     int64_t *d = vd, *n = vn, *m = vm;
1966     bool q = false;
1967 
1968     for (i = 0; i < oprsz / 8; i++) {
1969         int64_t nn = n[i], mm = m[i], dd = nn - mm;
1970         if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1971             dd = (nn >> 63) ^ ~INT64_MIN;
1972             q = true;
1973         }
1974         d[i] = dd;
1975     }
1976     if (q) {
1977         uint32_t *qc = vq;
1978         qc[0] = 1;
1979     }
1980     clear_tail(d, oprsz, simd_maxsz(desc));
1981 }
1982 
1983 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
1984                            void *vm, uint32_t desc)
1985 {
1986     intptr_t i, oprsz = simd_oprsz(desc);
1987     uint64_t *d = vd, *n = vn, *m = vm;
1988     bool q = false;
1989 
1990     for (i = 0; i < oprsz / 8; i++) {
1991         uint64_t nn = n[i];
1992         int64_t mm = m[i];
1993         uint64_t dd = nn + mm;
1994 
1995         if (mm < 0) {
1996             if (nn < (uint64_t)-mm) {
1997                 dd = 0;
1998                 q = true;
1999             }
2000         } else {
2001             if (dd < nn) {
2002                 dd = UINT64_MAX;
2003                 q = true;
2004             }
2005         }
2006         d[i] = dd;
2007     }
2008     if (q) {
2009         uint32_t *qc = vq;
2010         qc[0] = 1;
2011     }
2012     clear_tail(d, oprsz, simd_maxsz(desc));
2013 }
2014 
2015 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
2016                            void *vm, uint32_t desc)
2017 {
2018     intptr_t i, oprsz = simd_oprsz(desc);
2019     uint64_t *d = vd, *n = vn, *m = vm;
2020     bool q = false;
2021 
2022     for (i = 0; i < oprsz / 8; i++) {
2023         int64_t nn = n[i];
2024         uint64_t mm = m[i];
2025         int64_t dd = nn + mm;
2026 
2027         if (mm > (uint64_t)(INT64_MAX - nn)) {
2028             dd = INT64_MAX;
2029             q = true;
2030         }
2031         d[i] = dd;
2032     }
2033     if (q) {
2034         uint32_t *qc = vq;
2035         qc[0] = 1;
2036     }
2037     clear_tail(d, oprsz, simd_maxsz(desc));
2038 }
2039 
2040 #define DO_SRA(NAME, TYPE)                              \
2041 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2042 {                                                       \
2043     intptr_t i, oprsz = simd_oprsz(desc);               \
2044     int shift = simd_data(desc);                        \
2045     TYPE *d = vd, *n = vn;                              \
2046     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2047         d[i] += n[i] >> shift;                          \
2048     }                                                   \
2049     clear_tail(d, oprsz, simd_maxsz(desc));             \
2050 }
2051 
2052 DO_SRA(gvec_ssra_b, int8_t)
2053 DO_SRA(gvec_ssra_h, int16_t)
2054 DO_SRA(gvec_ssra_s, int32_t)
2055 DO_SRA(gvec_ssra_d, int64_t)
2056 
2057 DO_SRA(gvec_usra_b, uint8_t)
2058 DO_SRA(gvec_usra_h, uint16_t)
2059 DO_SRA(gvec_usra_s, uint32_t)
2060 DO_SRA(gvec_usra_d, uint64_t)
2061 
2062 #undef DO_SRA
2063 
2064 #define DO_RSHR(NAME, TYPE)                             \
2065 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2066 {                                                       \
2067     intptr_t i, oprsz = simd_oprsz(desc);               \
2068     int shift = simd_data(desc);                        \
2069     TYPE *d = vd, *n = vn;                              \
2070     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2071         TYPE tmp = n[i] >> (shift - 1);                 \
2072         d[i] = (tmp >> 1) + (tmp & 1);                  \
2073     }                                                   \
2074     clear_tail(d, oprsz, simd_maxsz(desc));             \
2075 }
2076 
2077 DO_RSHR(gvec_srshr_b, int8_t)
2078 DO_RSHR(gvec_srshr_h, int16_t)
2079 DO_RSHR(gvec_srshr_s, int32_t)
2080 DO_RSHR(gvec_srshr_d, int64_t)
2081 
2082 DO_RSHR(gvec_urshr_b, uint8_t)
2083 DO_RSHR(gvec_urshr_h, uint16_t)
2084 DO_RSHR(gvec_urshr_s, uint32_t)
2085 DO_RSHR(gvec_urshr_d, uint64_t)
2086 
2087 #undef DO_RSHR
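/*
 * The two-step form above (also used by DO_RSRA below) computes the
 * rounded shift
 *     d = (n + (1 << (shift - 1))) >> shift
 * without the intermediate addition overflowing and without an
 * out-of-range shift when shift equals the element width: tmp keeps
 * one extra bit, (tmp >> 1) is the truncated result and (tmp & 1) is
 * the rounding increment.  E.g. n = 7, shift = 2: tmp = 3, d = 2.
 */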
2088 
2089 #define DO_RSRA(NAME, TYPE)                             \
2090 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2091 {                                                       \
2092     intptr_t i, oprsz = simd_oprsz(desc);               \
2093     int shift = simd_data(desc);                        \
2094     TYPE *d = vd, *n = vn;                              \
2095     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2096         TYPE tmp = n[i] >> (shift - 1);                 \
2097         d[i] += (tmp >> 1) + (tmp & 1);                 \
2098     }                                                   \
2099     clear_tail(d, oprsz, simd_maxsz(desc));             \
2100 }
2101 
2102 DO_RSRA(gvec_srsra_b, int8_t)
2103 DO_RSRA(gvec_srsra_h, int16_t)
2104 DO_RSRA(gvec_srsra_s, int32_t)
2105 DO_RSRA(gvec_srsra_d, int64_t)
2106 
2107 DO_RSRA(gvec_ursra_b, uint8_t)
2108 DO_RSRA(gvec_ursra_h, uint16_t)
2109 DO_RSRA(gvec_ursra_s, uint32_t)
2110 DO_RSRA(gvec_ursra_d, uint64_t)
2111 
2112 #undef DO_RSRA
2113 
2114 #define DO_SRI(NAME, TYPE)                              \
2115 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2116 {                                                       \
2117     intptr_t i, oprsz = simd_oprsz(desc);               \
2118     int shift = simd_data(desc);                        \
2119     TYPE *d = vd, *n = vn;                              \
2120     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2121         d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
2122     }                                                   \
2123     clear_tail(d, oprsz, simd_maxsz(desc));             \
2124 }
2125 
2126 DO_SRI(gvec_sri_b, uint8_t)
2127 DO_SRI(gvec_sri_h, uint16_t)
2128 DO_SRI(gvec_sri_s, uint32_t)
2129 DO_SRI(gvec_sri_d, uint64_t)
2130 
2131 #undef DO_SRI
2132 
2133 #define DO_SLI(NAME, TYPE)                              \
2134 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2135 {                                                       \
2136     intptr_t i, oprsz = simd_oprsz(desc);               \
2137     int shift = simd_data(desc);                        \
2138     TYPE *d = vd, *n = vn;                              \
2139     for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
2140         d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
2141     }                                                   \
2142     clear_tail(d, oprsz, simd_maxsz(desc));             \
2143 }
2144 
2145 DO_SLI(gvec_sli_b, uint8_t)
2146 DO_SLI(gvec_sli_h, uint16_t)
2147 DO_SLI(gvec_sli_s, uint32_t)
2148 DO_SLI(gvec_sli_d, uint64_t)
2149 
2150 #undef DO_SLI
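/*
 * Worked example for 8-bit lanes with shift == 4, d = 0xa5, n = 0xff:
 *   SRI inserts n >> 4 into the low 4 bits of d, giving 0xaf;
 *   SLI inserts n into the top 4 bits of d, giving 0xf5.
 * In both cases the remaining bits of the destination are preserved,
 * which is why deposit64() is used rather than a plain shift.
 */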
2151 
2152 /*
2153  * Convert float16 to float32, raising no exceptions and
2154  * preserving exceptional values, including SNaN.
2155  * This is effectively an unpack+repack operation.
2156  */
2157 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
2158 {
2159     const int f16_bias = 15;
2160     const int f32_bias = 127;
2161     uint32_t sign = extract32(f16, 15, 1);
2162     uint32_t exp = extract32(f16, 10, 5);
2163     uint32_t frac = extract32(f16, 0, 10);
2164 
2165     if (exp == 0x1f) {
2166         /* Inf or NaN */
2167         exp = 0xff;
2168     } else if (exp == 0) {
2169         /* Zero or denormal.  */
2170         if (frac != 0) {
2171             if (fz16) {
2172                 frac = 0;
2173             } else {
2174                 /*
2175                  * Denormal; these are all normal float32.
2176                  * Shift the fraction so that the msb is at bit 11,
2177                  * then remove bit 11 as the implicit bit of the
2178                  * normalized float32.  Note that we still go through
2179                  * the shift for normal numbers below, to put the
2180                  * float32 fraction at the right place.
2181                  */
2182                 int shift = clz32(frac) - 21;
2183                 frac = (frac << shift) & 0x3ff;
2184                 exp = f32_bias - f16_bias - shift + 1;
2185             }
2186         }
2187     } else {
2188         /* Normal number; adjust the bias.  */
2189         exp += f32_bias - f16_bias;
2190     }
2191     sign <<= 31;
2192     exp <<= 23;
2193     frac <<= 23 - 10;
2194 
2195     return sign | exp | frac;
2196 }
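/*
 * Illustrative spot-checks of the repacking (values follow from the
 * code above):
 *   1.0      : 0x3c00 -> 0x3f800000
 *   -inf     : 0xfc00 -> 0xff800000
 *   sNaN     : 0x7d00 -> 0x7fa00000  (still signalling; no trap taken)
 *   denormal : 0x0001 -> 0x33800000  (2**-24 as a normal float32)
 */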
2197 
2198 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
2199 {
2200     /*
2201      * Branchless load of u32[0], u64[0], u32[1], or u64[1].
2202      * Load the 2nd qword iff is_q & is_2.
2203      * Shift to the 2nd dword iff !is_q & is_2.
2204      * For !is_q & !is_2, the upper bits of the result are garbage.
2205      */
2206     return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
2207 }
2208 
2209 /*
2210  * Note that FMLAL requires oprsz == 8 or oprsz == 16,
2211  * as there are not yet SVE versions that might use blocking.
2212  */
2213 
2214 static void do_fmlal(float32 *d, void *vn, void *vm,
2215                      CPUARMState *env, uint32_t desc,
2216                      ARMFPStatusFlavour fpst_idx,
2217                      uint64_t negx, int negf)
2218 {
2219     float_status *fpst = &env->vfp.fp_status[fpst_idx];
2220     bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2221     intptr_t i, oprsz = simd_oprsz(desc);
2222     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2223     int is_q = oprsz == 16;
2224     uint64_t n_4, m_4;
2225 
2226     /*
2227      * Pre-load all of the f16 data, avoiding overlap issues.
2228      * Negate all inputs for AH=0 FMLSL at once.
2229      */
2230     n_4 = load4_f16(vn, is_q, is_2) ^ negx;
2231     m_4 = load4_f16(vm, is_q, is_2);
2232 
2233     for (i = 0; i < oprsz / 4; i++) {
2234         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2235         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
2236         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
2237     }
2238     clear_tail(d, oprsz, simd_maxsz(desc));
2239 }
2240 
2241 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
2242                             CPUARMState *env, uint32_t desc)
2243 {
2244     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2245     uint64_t negx = is_s ? 0x8000800080008000ull : 0;
2246 
2247     do_fmlal(vd, vn, vm, env, desc, FPST_STD, negx, 0);
2248 }
2249 
2250 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
2251                             CPUARMState *env, uint32_t desc)
2252 {
2253     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2254     uint64_t negx = 0;
2255     int negf = 0;
2256 
2257     if (is_s) {
2258         if (env->vfp.fpcr & FPCR_AH) {
2259             negf = float_muladd_negate_product;
2260         } else {
2261             negx = 0x8000800080008000ull;
2262         }
2263     }
2264     do_fmlal(vd, vn, vm, env, desc, FPST_A64, negx, negf);
2265 }
2266 
2267 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
2268                                CPUARMState *env, uint32_t desc)
2269 {
2270     intptr_t i, oprsz = simd_oprsz(desc);
2271     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2272     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2273     bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
2274     float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
2275     bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2276     int negx = 0, negf = 0;
2277 
2278     if (is_s) {
2279         if (env->vfp.fpcr & FPCR_AH) {
2280             negf = float_muladd_negate_product;
2281         } else {
2282             negx = 0x8000;
2283         }
2284     }
2285 
2286     for (i = 0; i < oprsz; i += sizeof(float32)) {
2287         float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negx;
2288         float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
2289         float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2290         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2291         float32 aa = *(float32 *)(va + H1_4(i));
2292 
2293         *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, negf, status);
2294     }
2295 }
2296 
2297 static void do_fmlal_idx(float32 *d, void *vn, void *vm,
2298                          CPUARMState *env, uint32_t desc,
2299                          ARMFPStatusFlavour fpst_idx,
2300                          uint64_t negx, int negf)
2301 {
2302     float_status *fpst = &env->vfp.fp_status[fpst_idx];
2303     bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2304     intptr_t i, oprsz = simd_oprsz(desc);
2305     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
2306     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
2307     int is_q = oprsz == 16;
2308     uint64_t n_4;
2309     float32 m_1;
2310 
2311     /*
2312      * Pre-load all of the f16 data, avoiding overlap issues.
2313      * Negate all inputs for AH=0 FMLSL at once.
2314      */
2315     n_4 = load4_f16(vn, is_q, is_2) ^ negx;
2316     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
2317 
2318     for (i = 0; i < oprsz / 4; i++) {
2319         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
2320         d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
2321     }
2322     clear_tail(d, oprsz, simd_maxsz(desc));
2323 }
2324 
2325 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
2326                                 CPUARMState *env, uint32_t desc)
2327 {
2328     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2329     uint64_t negx = is_s ? 0x8000800080008000ull : 0;
2330 
2331     do_fmlal_idx(vd, vn, vm, env, desc, FPST_STD, negx, 0);
2332 }
2333 
2334 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
2335                                 CPUARMState *env, uint32_t desc)
2336 {
2337     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2338     uint64_t negx = 0;
2339     int negf = 0;
2340 
2341     if (is_s) {
2342         if (env->vfp.fpcr & FPCR_AH) {
2343             negf = float_muladd_negate_product;
2344         } else {
2345             negx = 0x8000800080008000ull;
2346         }
2347     }
2348     do_fmlal_idx(vd, vn, vm, env, desc, FPST_A64, negx, negf);
2349 }
2350 
2351 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
2352                                CPUARMState *env, uint32_t desc)
2353 {
2354     intptr_t i, j, oprsz = simd_oprsz(desc);
2355     bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
2356     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
2357     bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
2358     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 3, 3) * sizeof(float16);
2359     float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
2360     bool fz16 = env->vfp.fpcr & FPCR_FZ16;
2361     int negx = 0, negf = 0;
2362 
2363     if (is_s) {
2364         if (env->vfp.fpcr & FPCR_AH) {
2365             negf = float_muladd_negate_product;
2366         } else {
2367             negx = 0x8000;
2368         }
2369     }
2370     for (i = 0; i < oprsz; i += 16) {
2371         float16 mm_16 = *(float16 *)(vm + i + idx);
2372         float32 mm = float16_to_float32_by_bits(mm_16, fz16);
2373 
2374         for (j = 0; j < 16; j += sizeof(float32)) {
2375             float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negx;
2376             float32 nn = float16_to_float32_by_bits(nn_16, fz16);
2377             float32 aa = *(float32 *)(va + H1_4(i + j));
2378 
2379             *(float32 *)(vd + H1_4(i + j)) =
2380                 float32_muladd(nn, mm, aa, negf, status);
2381         }
2382     }
2383 }
2384 
2385 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2386 {
2387     intptr_t i, opr_sz = simd_oprsz(desc);
2388     int8_t *d = vd, *n = vn, *m = vm;
2389 
2390     for (i = 0; i < opr_sz; ++i) {
2391         int8_t mm = m[i];
2392         int8_t nn = n[i];
2393         int8_t res = 0;
2394         if (mm >= 0) {
2395             if (mm < 8) {
2396                 res = nn << mm;
2397             }
2398         } else {
2399             res = nn >> (mm > -8 ? -mm : 7);
2400         }
2401         d[i] = res;
2402     }
2403     clear_tail(d, opr_sz, simd_maxsz(desc));
2404 }
2405 
2406 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2407 {
2408     intptr_t i, opr_sz = simd_oprsz(desc);
2409     int16_t *d = vd, *n = vn, *m = vm;
2410 
2411     for (i = 0; i < opr_sz / 2; ++i) {
2412         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2413         int16_t nn = n[i];
2414         int16_t res = 0;
2415         if (mm >= 0) {
2416             if (mm < 16) {
2417                 res = nn << mm;
2418             }
2419         } else {
2420             res = nn >> (mm > -16 ? -mm : 15);
2421         }
2422         d[i] = res;
2423     }
2424     clear_tail(d, opr_sz, simd_maxsz(desc));
2425 }
2426 
2427 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
2428 {
2429     intptr_t i, opr_sz = simd_oprsz(desc);
2430     uint8_t *d = vd, *n = vn, *m = vm;
2431 
2432     for (i = 0; i < opr_sz; ++i) {
2433         int8_t mm = m[i];
2434         uint8_t nn = n[i];
2435         uint8_t res = 0;
2436         if (mm >= 0) {
2437             if (mm < 8) {
2438                 res = nn << mm;
2439             }
2440         } else {
2441             if (mm > -8) {
2442                 res = nn >> -mm;
2443             }
2444         }
2445         d[i] = res;
2446     }
2447     clear_tail(d, opr_sz, simd_maxsz(desc));
2448 }
2449 
2450 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
2451 {
2452     intptr_t i, opr_sz = simd_oprsz(desc);
2453     uint16_t *d = vd, *n = vn, *m = vm;
2454 
2455     for (i = 0; i < opr_sz / 2; ++i) {
2456         int8_t mm = m[i];   /* only 8 bits of shift are significant */
2457         uint16_t nn = n[i];
2458         uint16_t res = 0;
2459         if (mm >= 0) {
2460             if (mm < 16) {
2461                 res = nn << mm;
2462             }
2463         } else {
2464             if (mm > -16) {
2465                 res = nn >> -mm;
2466             }
2467         }
2468         d[i] = res;
2469     }
2470     clear_tail(d, opr_sz, simd_maxsz(desc));
2471 }
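/*
 * Worked examples of the shift-by-register semantics above (the count
 * is the signed low byte of m; negative counts shift right):
 *   sshl_b: n = 0x80 (-128), m = -3 -> -128 >> 3 = -16 (0xf0)
 *   sshl_b: n = 0x80,        m = -9 -> count clamped to 7, sign fill
 *           gives -1 (0xff)
 *   ushl_b: n = 0x80,        m = -9 -> 0 (count out of range)
 *   ushl_b: n = 0x01,        m = 7  -> 0x80;  m = 8 -> 0
 */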
2472 
2473 /*
2474  * 8x8->8 polynomial multiply.
2475  *
2476  * Polynomial multiplication is like integer multiplication except the
2477  * partial products are XORed, not added.
2478  *
2479  * TODO: expose this as a generic vector operation, as it is a common
2480  * crypto building block.
2481  */
2482 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
2483 {
2484     intptr_t i, opr_sz = simd_oprsz(desc);
2485     uint64_t *d = vd, *n = vn, *m = vm;
2486 
2487     for (i = 0; i < opr_sz / 8; ++i) {
2488         d[i] = clmul_8x8_low(n[i], m[i]);
2489     }
2490     clear_tail(d, opr_sz, simd_maxsz(desc));
2491 }
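/*
 * Worked example: pmul(0x03, 0x03).  The partial products are 0x03
 * and 0x06; XORing them gives 0x05, where integer multiplication
 * would add them to give 0x09.  In GF(2) terms, (x + 1)(x + 1) =
 * x^2 + 1.
 */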
2492 
2493 /*
2494  * 64x64->128 polynomial multiply.
2495  * Because the lanes are not accessed in strict columns,
2496  * this probably cannot be turned into a generic helper.
2497  */
2498 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2499 {
2500     intptr_t i, opr_sz = simd_oprsz(desc);
2501     intptr_t hi = simd_data(desc);
2502     uint64_t *d = vd, *n = vn, *m = vm;
2503 
2504     for (i = 0; i < opr_sz / 8; i += 2) {
2505         Int128 r = clmul_64(n[i + hi], m[i + hi]);
2506         d[i] = int128_getlo(r);
2507         d[i + 1] = int128_gethi(r);
2508     }
2509     clear_tail(d, opr_sz, simd_maxsz(desc));
2510 }
2511 
2512 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2513 {
2514     int hi = simd_data(desc);
2515     uint64_t *d = vd, *n = vn, *m = vm;
2516     uint64_t nn = n[hi], mm = m[hi];
2517 
2518     d[0] = clmul_8x4_packed(nn, mm);
2519     nn >>= 32;
2520     mm >>= 32;
2521     d[1] = clmul_8x4_packed(nn, mm);
2522 
2523     clear_tail(d, 16, simd_maxsz(desc));
2524 }
2525 
2526 #ifdef TARGET_AARCH64
2527 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2528 {
2529     int shift = simd_data(desc) * 8;
2530     intptr_t i, opr_sz = simd_oprsz(desc);
2531     uint64_t *d = vd, *n = vn, *m = vm;
2532 
2533     for (i = 0; i < opr_sz / 8; ++i) {
2534         d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2535     }
2536 }
2537 
2538 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2539 {
2540     intptr_t sel = H4(simd_data(desc));
2541     intptr_t i, opr_sz = simd_oprsz(desc);
2542     uint32_t *n = vn, *m = vm;
2543     uint64_t *d = vd;
2544 
2545     for (i = 0; i < opr_sz / 8; ++i) {
2546         d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
2547     }
2548 }
2549 #endif
2550 
2551 #define DO_CMP0(NAME, TYPE, OP)                         \
2552 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
2553 {                                                       \
2554     intptr_t i, opr_sz = simd_oprsz(desc);              \
2555     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
2556         TYPE nn = *(TYPE *)(vn + i);                    \
2557         *(TYPE *)(vd + i) = -(nn OP 0);                 \
2558     }                                                   \
2559     clear_tail(vd, opr_sz, simd_maxsz(desc));           \
2560 }
2561 
2562 DO_CMP0(gvec_ceq0_b, int8_t, ==)
2563 DO_CMP0(gvec_clt0_b, int8_t, <)
2564 DO_CMP0(gvec_cle0_b, int8_t, <=)
2565 DO_CMP0(gvec_cgt0_b, int8_t, >)
2566 DO_CMP0(gvec_cge0_b, int8_t, >=)
2567 
2568 DO_CMP0(gvec_ceq0_h, int16_t, ==)
2569 DO_CMP0(gvec_clt0_h, int16_t, <)
2570 DO_CMP0(gvec_cle0_h, int16_t, <=)
2571 DO_CMP0(gvec_cgt0_h, int16_t, >)
2572 DO_CMP0(gvec_cge0_h, int16_t, >=)
2573 
2574 #undef DO_CMP0
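/*
 * The negation above turns the C boolean into the architectural
 * all-ones mask: (nn OP 0) evaluates to 0 or 1, so -(nn OP 0) is
 * 0 or -1, i.e. all bits clear or all bits set in the element.
 */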
2575 
2576 #define DO_ABD(NAME, TYPE)                                      \
2577 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2578 {                                                               \
2579     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2580     TYPE *d = vd, *n = vn, *m = vm;                             \
2581                                                                 \
2582     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2583         d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
2584     }                                                           \
2585     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2586 }
2587 
2588 DO_ABD(gvec_sabd_b, int8_t)
2589 DO_ABD(gvec_sabd_h, int16_t)
2590 DO_ABD(gvec_sabd_s, int32_t)
2591 DO_ABD(gvec_sabd_d, int64_t)
2592 
2593 DO_ABD(gvec_uabd_b, uint8_t)
2594 DO_ABD(gvec_uabd_h, uint16_t)
2595 DO_ABD(gvec_uabd_s, uint32_t)
2596 DO_ABD(gvec_uabd_d, uint64_t)
2597 
2598 #undef DO_ABD
2599 
2600 #define DO_ABA(NAME, TYPE)                                      \
2601 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2602 {                                                               \
2603     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2604     TYPE *d = vd, *n = vn, *m = vm;                             \
2605                                                                 \
2606     for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
2607         d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
2608     }                                                           \
2609     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
2610 }
2611 
2612 DO_ABA(gvec_saba_b, int8_t)
2613 DO_ABA(gvec_saba_h, int16_t)
2614 DO_ABA(gvec_saba_s, int32_t)
2615 DO_ABA(gvec_saba_d, int64_t)
2616 
2617 DO_ABA(gvec_uaba_b, uint8_t)
2618 DO_ABA(gvec_uaba_h, uint16_t)
2619 DO_ABA(gvec_uaba_s, uint32_t)
2620 DO_ABA(gvec_uaba_d, uint64_t)
2621 
2622 #undef DO_ABA
2623 
2624 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2625 void HELPER(NAME)(void *vd, void *vn, void *vm,                            \
2626                   float_status *stat, uint32_t desc)                       \
2627 {                                                                          \
2628     ARMVectorReg scratch;                                                  \
2629     intptr_t oprsz = simd_oprsz(desc);                                     \
2630     intptr_t half = oprsz / sizeof(TYPE) / 2;                              \
2631     TYPE *d = vd, *n = vn, *m = vm;                                        \
2632     if (unlikely(d == m)) {                                                \
2633         m = memcpy(&scratch, m, oprsz);                                    \
2634     }                                                                      \
2635     for (intptr_t i = 0; i < half; ++i) {                                  \
2636         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);                \
2637     }                                                                      \
2638     for (intptr_t i = 0; i < half; ++i) {                                  \
2639         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);         \
2640     }                                                                      \
2641     clear_tail(d, oprsz, simd_maxsz(desc));                                \
2642 }
2643 
2644 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
2645 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
2646 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )
2647 
2648 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
2649 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
2650 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )
2651 
2652 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
2653 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
2654 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )
2655 
2656 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
2657 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
2658 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )
2659 
2660 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
2661 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
2662 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )
2663 
2664 #ifdef TARGET_AARCH64
2665 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
2666 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
2667 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, )
2668 
2669 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
2670 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
2671 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, )
2672 #endif
2673 
2674 #undef DO_3OP_PAIR
2675 
2676 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \
2677 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
2678 {                                                               \
2679     ARMVectorReg scratch;                                       \
2680     intptr_t oprsz = simd_oprsz(desc);                          \
2681     intptr_t half = oprsz / sizeof(TYPE) / 2;                   \
2682     TYPE *d = vd, *n = vn, *m = vm;                             \
2683     if (unlikely(d == m)) {                                     \
2684         m = memcpy(&scratch, m, oprsz);                         \
2685     }                                                           \
2686     for (intptr_t i = 0; i < half; ++i) {                       \
2687         d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);           \
2688     }                                                           \
2689     for (intptr_t i = 0; i < half; ++i) {                       \
2690         d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);    \
2691     }                                                           \
2692     clear_tail(d, oprsz, simd_maxsz(desc));                     \
2693 }
2694 
2695 #define ADD(A, B) (A + B)
2696 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
2697 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
2698 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
2699 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
2700 #undef  ADD
2701 
2702 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
2703 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
2704 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)
2705 
2706 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
2707 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
2708 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)
2709 
2710 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
2711 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
2712 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)
2713 
2714 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
2715 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
2716 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)
2717 
2718 #undef DO_3OP_PAIR
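/*
 * Worked example of the pairwise layout: for gvec_addp_s with
 * oprsz == 16, n = {a, b, c, d} and m = {e, f, g, h} produce
 * d = {a+b, c+d, e+f, g+h}.  The first half of the output reduces
 * adjacent pairs of n and the second half adjacent pairs of m, which
 * is why m (but not n) must be copied to scratch when it aliases d:
 * the writes to d[half..] would otherwise clobber unread inputs.
 */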
2719 
2720 #define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
2721     void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
2722     {                                                                   \
2723         intptr_t i, oprsz = simd_oprsz(desc);                           \
2724         int shift = simd_data(desc);                                    \
2725         TYPE *d = vd, *n = vn;                                          \
2726         float_status *fpst = stat;                                      \
2727         for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
2728             d[i] = FUNC(n[i], shift, fpst);                             \
2729         }                                                               \
2730         clear_tail(d, oprsz, simd_maxsz(desc));                         \
2731     }
2732 
2733 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
2734 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
2735 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2736 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2737 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2738 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2739 
2740 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
2741 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
2742 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2743 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
2744 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2745 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2746 
2747 #undef DO_VCVT_FIXED
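/*
 * Worked example: for the fixed-point conversions, simd_data(desc)
 * holds the number of fraction bits.  With shift == 8, gvec_vcvt_sf
 * turns the lane value 384 (0x180, i.e. 384/256) into 1.5f, and
 * gvec_vcvt_rz_fs turns 1.5f back into 384, rounding toward zero.
 */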

#define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
    void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
    {                                                                   \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        uint32_t rmode = simd_data(desc);                               \
        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
        TYPE *d = vd, *n = vn;                                          \
        set_float_rounding_mode(rmode, fpst);                           \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], 0, fpst);                                 \
        }                                                               \
        set_float_rounding_mode(prev_rmode, fpst);                      \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)

#undef DO_VCVT_RMODE

#define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
    void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
    {                                                                   \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        uint32_t rmode = simd_data(desc);                               \
        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
        TYPE *d = vd, *n = vn;                                          \
        set_float_rounding_mode(rmode, fpst);                           \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], fpst);                                    \
        }                                                               \
        set_float_rounding_mode(prev_rmode, fpst);                      \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)

#undef DO_VRINT_RMODE

#ifdef TARGET_AARCH64
void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
{
    const uint8_t *indices = vm;
    size_t oprsz = simd_oprsz(desc);
    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
    union {
        uint8_t b[16];
        uint64_t d[2];
    } result;

    /*
     * We must construct the final result in a temp, lest the output
     * overlaps the input table.  For TBL, begin with zero; for TBX,
     * begin with the original register contents.  Note that we always
     * copy 16 bytes here to avoid an extra branch; clearing the high
     * bits of the register for oprsz == 8 is handled below.
     */
    if (is_tbx) {
        memcpy(&result, vd, 16);
    } else {
        memset(&result, 0, 16);
    }

    for (size_t i = 0; i < oprsz; ++i) {
        uint32_t index = indices[H1(i)];

        if (index < table_len) {
            /*
             * Convert index, a byte offset into the virtual table
             * (a concatenated series of 128-bit vectors), into the
             * correct register element, bearing in mind that the
             * table can wrap around from V31 to V0.
             */
            const uint8_t *table = (const uint8_t *)
                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
            result.b[H1(i)] = table[H1(index % 16)];
        }
    }

    memcpy(vd, &result, 16);
    clear_tail(vd, oprsz, simd_maxsz(desc));
}
#endif
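/*
 * Worked example for the helper above: with rn = 30, table_len = 32
 * (a two-register table) and index = 0x13, the byte comes from
 * register (30 + (0x13 >> 4)) % 32 = 31, element 0x13 % 16 = 3;
 * the wrap-around case (rn + 1) % 32 = 0 arises for rn = 31.
 * For TBL an out-of-range index yields 0; for TBX it leaves the
 * destination byte unchanged.
 */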

/*
 * NxN -> N highpart multiply
 *
 * TODO: expose this as a generic vector operation.
 */
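/*
 * For example, in the byte case: n = 0x80 (-128) and m = 0x7f (127)
 * give a 32-bit product of -16256 (0xffffc080), so the high part
 * ((int32_t)n * m) >> 8 is -64 (0xc0); the unsigned variant instead
 * computes (128 * 127) >> 8 = 63 (0x3f).
 */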

void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ((int32_t)n[i] * m[i]) >> 8;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = ((int32_t)n[i] * m[i]) >> 16;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = ((int64_t)n[i] * m[i]) >> 32;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t discard;

    for (i = 0; i < opr_sz / 8; ++i) {
        muls64(&discard, &d[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
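/*
 * muls64/mulu64 (from qemu/host-utils.h) compute the full 128-bit
 * product into (*plow, *phigh); only the high half is kept here:
 *
 *  uint64_t lo, hi;
 *  mulu64(&lo, &hi, 0x8000000000000000ull, 2);   (then hi == 1, lo == 0)
 */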

void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ((uint32_t)n[i] * m[i]) >> 8;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = ((uint32_t)n[i] * m[i]) >> 16;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = ((uint64_t)n[i] * m[i]) >> 32;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t discard;

    for (i = 0; i < opr_sz / 8; ++i) {
        mulu64(&discard, &d[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror64(n[i] ^ m[i], shr);
    }
    clear_tail(d, opr_sz * 8, simd_maxsz(desc));
}
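/*
 * XAR: exclusive-or and rotate right.  E.g. with shr = 8,
 * n = 0x00000000000000ff and m = 0 gives ror64(0xff, 8) =
 * 0xff00000000000000.
 */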

/*
 * Integer matrix-multiply accumulate
 */

static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
{
    int8_t *n = vn, *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn, *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn;
    int8_t *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
                      uint32_t (*inner_loop)(uint32_t, void *, void *))
{
    intptr_t seg, opr_sz = simd_oprsz(desc);

    for (seg = 0; seg < opr_sz; seg += 16) {
        uint32_t *d = vd + seg;
        uint32_t *a = va + seg;
        uint32_t sum0, sum1, sum2, sum3;

        /*
         * Process the entire segment at once, writing back the
         * results only after we've consumed all of the inputs.
         *
         * Key to indices by column:
         *          i   j                  i             j
         */
        sum0 = a[H4(0 + 0)];
        sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
        sum1 = a[H4(0 + 1)];
        sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
        sum2 = a[H4(2 + 0)];
        sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
        sum3 = a[H4(2 + 1)];
        sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);

        d[H4(0)] = sum0;
        d[H4(1)] = sum1;
        d[H4(2)] = sum2;
        d[H4(3)] = sum3;
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}

#define DO_MMLA_B(NAME, INNER) \
    void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
    { do_mmla_b(vd, vn, vm, va, desc, INNER); }

DO_MMLA_B(gvec_smmla_b, do_smmla_b)
DO_MMLA_B(gvec_ummla_b, do_ummla_b)
DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
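/*
 * Equivalently, as a sketch in matrix terms: each 16-byte segment
 * holds two 8-byte rows of N, two 8-byte rows of M, and a 2x2 tile
 * of int32 accumulators, computing
 *
 *  for (i = 0; i < 2; i++) {
 *      for (j = 0; j < 2; j++) {
 *          d[2 * i + j] = a[2 * i + j] + dot(n_row[i], m_row[j]);
 *      }
 *  }
 *
 * which matches the unrolled sum0..sum3 assignments in do_mmla_b.
 */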

/*
 * BFloat16 Dot Product
 */

bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
{
    /*
     * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
     * For EBF = 0, we ignore the FPCR bits which determine rounding
     * mode and denormal-flushing, and we do unfused multiplies and
     * additions with intermediate rounding of all products and sums.
     * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
     * and we perform a fused two-way sum-of-products without intermediate
     * rounding of the products.
     * In either case, we don't set fp exception flags.
     *
     * EBF is AArch64 only, so even if it's set in the FPCR it has
     * no effect on AArch32 instructions.
     */
    bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;

    *statusp = env->vfp.fp_status[is_a64(env) ? FPST_A64 : FPST_A32];
    set_default_nan_mode(true, statusp);

    if (ebf) {
        /* EBF=1 needs to do a step with round-to-odd semantics */
        *oddstatusp = *statusp;
        set_float_rounding_mode(float_round_to_odd, oddstatusp);
    } else {
        set_flush_to_zero(true, statusp);
        set_flush_inputs_to_zero(true, statusp);
        set_float_rounding_mode(float_round_to_odd_inf, statusp);
    }
    return ebf;
}

float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
{
    float32 t1, t2;

    /*
     * Extract each BFloat16 from the element pair, and shift
     * them such that they become float32.
     */
    t1 = float32_mul(e1 << 16, e2 << 16, fpst);
    t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
    t1 = float32_add(t1, t2, fpst);
    t1 = float32_add(sum, t1, fpst);

    return t1;
}
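/*
 * A bfloat16 is simply the top 16 bits of the corresponding float32,
 * so the shifts above reinterpret rather than convert.  E.g. the
 * bfloat16 value 0x3f80 (1.0) becomes the float32 0x3f800000 (1.0f),
 * and the high element is recovered by masking instead of shifting.
 */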

float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
                     float_status *fpst, float_status *fpst_odd)
{
    float32 s1r = e1 << 16;
    float32 s1c = e1 & 0xffff0000u;
    float32 s2r = e2 << 16;
    float32 s2c = e2 & 0xffff0000u;
    float32 t32;

    /* Cf. the Arm pseudocode FPProcessNaNs4 */
    if (float32_is_any_nan(s1r) || float32_is_any_nan(s1c) ||
        float32_is_any_nan(s2r) || float32_is_any_nan(s2c)) {
        if (float32_is_signaling_nan(s1r, fpst)) {
            t32 = s1r;
        } else if (float32_is_signaling_nan(s1c, fpst)) {
            t32 = s1c;
        } else if (float32_is_signaling_nan(s2r, fpst)) {
            t32 = s2r;
        } else if (float32_is_signaling_nan(s2c, fpst)) {
            t32 = s2c;
        } else if (float32_is_any_nan(s1r)) {
            t32 = s1r;
        } else if (float32_is_any_nan(s1c)) {
            t32 = s1c;
        } else if (float32_is_any_nan(s2r)) {
            t32 = s2r;
        } else {
            t32 = s2c;
        }
        /*
         * FPConvertNaN(FPProcessNaN(t32)) will be done as part
         * of the final addition below.
         */
    } else {
        /*
         * Compare f16_dotadd() in sme_helper.c, but here we have
         * bfloat16 inputs. In particular that means that we do not
         * want the FPCR.FZ16 flush semantics, so we use the normal
         * float_status for the input handling here.
         */
        float64 e1r = float32_to_float64(s1r, fpst);
        float64 e1c = float32_to_float64(s1c, fpst);
        float64 e2r = float32_to_float64(s2r, fpst);
        float64 e2c = float32_to_float64(s2c, fpst);
        float64 t64;

        /*
         * The ARM pseudocode function FPDot performs both multiplies
         * and the add with a single rounding operation.  Emulate this
         * by performing the first multiply in round-to-odd, then doing
         * the second multiply as fused multiply-add, and rounding to
         * float32 all in one step.
         */
        t64 = float64_mul(e1r, e2r, fpst_odd);
        t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);

        /* This conversion is exact, because we've already rounded. */
        t32 = float64_to_float32(t64, fpst);
    }

    /* The final accumulation step is not fused. */
    return float32_add(sum, t32, fpst);
}
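/*
 * Round-to-odd ("von Neumann rounding") sets the result's low bit
 * whenever any precision was lost.  Because float64 has more than
 * twice the precision of float32 (53 vs 24 bits), rounding the first
 * product to odd and then rounding the fused result once to float32
 * cannot suffer from double rounding, matching the single-rounded
 * FPDot result.
 */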

void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
                        CPUARMState *env, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (i = 0; i < opr_sz / 4; ++i) {
            d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
        }
    } else {
        for (i = 0; i < opr_sz / 4; ++i) {
            d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
                            void *va, CPUARMState *env, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t index = simd_data(desc);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (i = 0; i < elements; i += eltspersegment) {
            uint32_t m_idx = m[i + H4(index)];

            for (j = i; j < i + eltspersegment; j++) {
                d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
            }
        }
    } else {
        for (i = 0; i < elements; i += eltspersegment) {
            uint32_t m_idx = m[i + H4(index)];

            for (j = i; j < i + eltspersegment; j++) {
                d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
            }
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
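/*
 * Segmentation example for the indexed form: with a 256-bit vector
 * (opr_sz == 32) there are 8 float32 elements in 2 segments of 4;
 * within each segment the same 32-bit pair m[i + H4(index)] is
 * dotted against every n[j] of that segment, as BFDOT (by element)
 * requires.
 */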

void HELPER(sme2_bfvdot_idx)(void *vd, void *vn, void *vm,
                             void *va, CPUARMState *env, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT, 2);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    uint16_t *n0 = vn;
    uint16_t *n1 = vn + sizeof(ARMVectorReg);
    uint32_t *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (i = 0; i < elements; i += eltspersegment) {
            uint32_t m_idx = m[i + H4(idx)];

            for (j = 0; j < eltspersegment; j++) {
                uint32_t nn = (n0[H2(2 * (i + j) + sel)])
                            | (n1[H2(2 * (i + j) + sel)] << 16);
                d[i + H4(j)] = bfdotadd_ebf(a[i + H4(j)], nn, m_idx,
                                            &fpst, &fpst_odd);
            }
        }
    } else {
        for (i = 0; i < elements; i += eltspersegment) {
            uint32_t m_idx = m[i + H4(idx)];

            for (j = 0; j < eltspersegment; j++) {
                uint32_t nn = (n0[H2(2 * (i + j) + sel)])
                            | (n1[H2(2 * (i + j) + sel)] << 16);
                d[i + H4(j)] = bfdotadd(a[i + H4(j)], nn, m_idx, &fpst);
            }
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
                         CPUARMState *env, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (s = 0; s < opr_sz / 4; s += 4) {
            float32 sum00, sum01, sum10, sum11;

            /*
             * Process the entire segment at once, writing back the
             * results only after we've consumed all of the inputs.
             *
             * Key to indices by column:
             *               i   j               i   k             j   k
             */
            sum00 = a[s + H4(0 + 0)];
            sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
            sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);

            sum01 = a[s + H4(0 + 1)];
            sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
            sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);

            sum10 = a[s + H4(2 + 0)];
            sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
            sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);

            sum11 = a[s + H4(2 + 1)];
            sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
            sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);

            d[s + H4(0 + 0)] = sum00;
            d[s + H4(0 + 1)] = sum01;
            d[s + H4(2 + 0)] = sum10;
            d[s + H4(2 + 1)] = sum11;
        }
    } else {
        for (s = 0; s < opr_sz / 4; s += 4) {
            float32 sum00, sum01, sum10, sum11;

            /*
             * Process the entire segment at once, writing back the
             * results only after we've consumed all of the inputs.
             *
             * Key to indices by column:
             *               i   j           i   k             j   k
             */
            sum00 = a[s + H4(0 + 0)];
            sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
            sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);

            sum01 = a[s + H4(0 + 1)];
            sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
            sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);

            sum10 = a[s + H4(2 + 0)];
            sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
            sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);

            sum11 = a[s + H4(2 + 1)];
            sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
            sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);

            d[s + H4(0 + 0)] = sum00;
            d[s + H4(0 + 1)] = sum01;
            d[s + H4(2 + 0)] = sum10;
            d[s + H4(2 + 1)] = sum11;
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

static void do_bfmlal(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a,
                      float_status *stat, uint32_t desc, int negx, int negf)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);

    for (i = 0; i < opr_sz / 4; ++i) {
        float32 nn = (negx ^ n[H2(i * 2 + sel)]) << 16;
        float32 mm = m[H2(i * 2 + sel)] << 16;
        d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], negf, stat);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
                         float_status *stat, uint32_t desc)
{
    do_bfmlal(vd, vn, vm, va, stat, desc, 0, 0);
}

void HELPER(gvec_bfmlsl)(void *vd, void *vn, void *vm, void *va,
                         float_status *stat, uint32_t desc)
{
    do_bfmlal(vd, vn, vm, va, stat, desc, 0x8000, 0);
}

void HELPER(gvec_ah_bfmlsl)(void *vd, void *vn, void *vm, void *va,
                            float_status *stat, uint32_t desc)
{
    do_bfmlal(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product);
}
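/*
 * Note on the flavours above: negx == 0x8000 flips the sign bit of
 * each bfloat16 n element before it is widened, giving the standard
 * BFMLSL negation, while the AH (alternate handling) variant passes
 * float_muladd_negate_product so that the product rather than the
 * input is negated; the distinction is visible only for NaN inputs.
 */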

static void do_bfmlal_idx(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a,
                          float_status *stat, uint32_t desc, int negx, int negf)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += eltspersegment) {
        float32 m_idx = m[H2(2 * i + index)] << 16;

        for (j = i; j < i + eltspersegment; j++) {
            float32 n_j = (negx ^ n[H2(2 * j + sel)]) << 16;
            d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], negf, stat);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *stat, uint32_t desc)
{
    do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, 0);
}

void HELPER(gvec_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *stat, uint32_t desc)
{
    do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0x8000, 0);
}

void HELPER(gvec_ah_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va,
                                float_status *stat, uint32_t desc)
{
    do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product);
}

#define DO_CLAMP(NAME, TYPE) \
void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
        TYPE aa = *(TYPE *)(a + i);                                     \
        TYPE nn = *(TYPE *)(n + i);                                     \
        TYPE mm = *(TYPE *)(m + i);                                     \
        TYPE dd = MIN(MAX(aa, nn), mm);                                 \
        *(TYPE *)(d + i) = dd;                                          \
    }                                                                   \
    clear_tail(d, opr_sz, simd_maxsz(desc));                            \
}

DO_CLAMP(gvec_sclamp_b, int8_t)
DO_CLAMP(gvec_sclamp_h, int16_t)
DO_CLAMP(gvec_sclamp_s, int32_t)
DO_CLAMP(gvec_sclamp_d, int64_t)

DO_CLAMP(gvec_uclamp_b, uint8_t)
DO_CLAMP(gvec_uclamp_h, uint16_t)
DO_CLAMP(gvec_uclamp_s, uint32_t)
DO_CLAMP(gvec_uclamp_d, uint64_t)
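/*
 * The clamp helpers above bound each a element to the range [n, m],
 * e.g. for gvec_sclamp_b: aa = 100, nn = -10, mm = 10 yields
 * MIN(MAX(100, -10), 10) = 10, and aa = -50 yields -10.
 */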

/* Bit count in each 8-bit word. */
void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ctpop8(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Reverse bits in each 8-bit word. */
void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = revbit64(bswap64(n[i]));
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
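/*
 * bswap64() reverses the eight bytes of each doubleword and
 * revbit64() then reverses all 64 bits; the composition leaves each
 * byte in place with its bits reversed.  E.g. 0x01 becomes 0x80 in
 * the lowest byte.
 */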

void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_recpe_u32(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_rsqrte_u32(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

static inline void do_lut_b(void *zd, uint64_t *indexes, uint64_t *table,
                            unsigned elements, unsigned segbase,
                            unsigned dstride, unsigned isize,
                            unsigned tsize, unsigned nreg)
{
    for (unsigned r = 0; r < nreg; ++r) {
        uint8_t *dst = zd + dstride * r;
        unsigned base = segbase + r * elements;

        for (unsigned e = 0; e < elements; ++e) {
            unsigned index = extractn(indexes, (base + e) * isize, isize);
            dst[H1(e)] = extractn(table, index * tsize, 8);
        }
    }
}

static inline void do_lut_h(void *zd, uint64_t *indexes, uint64_t *table,
                            unsigned elements, unsigned segbase,
                            unsigned dstride, unsigned isize,
                            unsigned tsize, unsigned nreg)
{
    for (unsigned r = 0; r < nreg; ++r) {
        uint16_t *dst = zd + dstride * r;
        unsigned base = segbase + r * elements;

        for (unsigned e = 0; e < elements; ++e) {
            unsigned index = extractn(indexes, (base + e) * isize, isize);
            dst[H2(e)] = extractn(table, index * tsize, 16);
        }
    }
}

static inline void do_lut_s(void *zd, uint64_t *indexes, uint32_t *table,
                            unsigned elements, unsigned segbase,
                            unsigned dstride, unsigned isize,
                            unsigned tsize, unsigned nreg)
{
    for (unsigned r = 0; r < nreg; ++r) {
        uint32_t *dst = zd + dstride * r;
        unsigned base = segbase + r * elements;

        for (unsigned e = 0; e < elements; ++e) {
            unsigned index = extractn(indexes, (base + e) * isize, isize);
            dst[H4(e)] = table[H4(index)];
        }
    }
}

#define DO_SME2_LUT(ISIZE, NREG, SUFF, ESIZE) \
void helper_sme2_luti##ISIZE##_##NREG##SUFF                             \
    (void *zd, void *zn, CPUARMState *env, uint32_t desc)               \
{                                                                       \
    unsigned vl = simd_oprsz(desc);                                     \
    unsigned strided = extract32(desc, SIMD_DATA_SHIFT, 1);             \
    unsigned idx = extract32(desc, SIMD_DATA_SHIFT + 1, 4);             \
    unsigned elements = vl / ESIZE;                                     \
    unsigned dstride = (!strided ? 1 : NREG == 4 ? 4 : 8);              \
    unsigned segments = (ESIZE * 8) / (ISIZE * NREG);                   \
    unsigned segment = idx & (segments - 1);                            \
    ARMVectorReg indexes;                                               \
    memcpy(&indexes, zn, vl);                                           \
    do_lut_##SUFF(zd, indexes.d, (void *)env->za_state.zt0, elements,   \
                  segment * NREG * elements,                            \
                  dstride * sizeof(ARMVectorReg), ISIZE, 32, NREG);     \
}

DO_SME2_LUT(2,1,b, 1)
DO_SME2_LUT(2,1,h, 2)
DO_SME2_LUT(2,1,s, 4)
DO_SME2_LUT(2,2,b, 1)
DO_SME2_LUT(2,2,h, 2)
DO_SME2_LUT(2,2,s, 4)
DO_SME2_LUT(2,4,b, 1)
DO_SME2_LUT(2,4,h, 2)
DO_SME2_LUT(2,4,s, 4)

DO_SME2_LUT(4,1,b, 1)
DO_SME2_LUT(4,1,h, 2)
DO_SME2_LUT(4,1,s, 4)
DO_SME2_LUT(4,2,b, 1)
DO_SME2_LUT(4,2,h, 2)
DO_SME2_LUT(4,2,s, 4)
DO_SME2_LUT(4,4,b, 1)
DO_SME2_LUT(4,4,h, 2)
DO_SME2_LUT(4,4,s, 4)

#undef DO_SME2_LUT
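/*
 * Sketch of the decode for helper_sme2_luti2_1b above: desc carries a
 * "strided" flag and a 4-bit immediate starting at SIMD_DATA_SHIFT;
 * with 2-bit indexes and one destination register there are
 * (1 * 8) / (2 * 1) = 4 index segments, the immediate selecting one.
 * Each result byte is then
 *
 *  dst[e] = extractn(zt0, extractn(indexes, (base + e) * 2, 2) * 32, 8);
 *
 * i.e. a 2-bit index picks one of the first four 32-bit ZT0 entries.
 */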