/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup.  */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)  ((x) ^ 7)
#define H2(x)  ((x) ^ 3)
#define H4(x)  ((x) ^ 1)
#else
#define H1(x)  (x)
#define H2(x)  (x)
#define H4(x)  (x)
#endif

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
static int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                             bool neg, bool round, uint32_t *sat)
{
    /*
     * Simplify:
     * = ((a3 << 16) + ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) + (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}
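
/*
 * Illustrative sketch (not part of the original helper): with round = true
 * and neg = false, do_sqrdmlah_h(0x4000, 0x4000, 0, ...) computes
 * ret = 0x4000 * 0x4000 = 0x10000000, adds the rounding constant 1 << 14
 * and shifts right by 15, giving 0x2000 -- the same value as the
 * architectural ((2 * n * m) + (1 << 15)) >> 16.  With src1 = src2 =
 * INT16_MIN the intermediate is 0x40004000 >> 15 = 0x8000, which does not
 * fit in int16_t, so the result saturates to INT16_MAX and *sat is set.
 */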

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
static int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                             bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_h above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the 64-bit lanes.
 * All elements are treated equally, no matter where they are.
 */

void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd;
    int8_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] += n[i * 4 + 0] * m[i * 4 + 0]
              + n[i * 4 + 1] * m[i * 4 + 1]
              + n[i * 4 + 2] * m[i * 4 + 2]
              + n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
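
/*
 * Illustrative note (not in the original source): each 32-bit lane of d
 * accumulates a 4-way dot product of the bytes sharing that lane.  For
 * example, if lane i of n holds the bytes {1, -2, 3, -4} and lane i of m
 * holds {5, 6, 7, 8}, then the loop above adds
 * 1*5 + (-2)*6 + 3*7 + (-4)*8 = -18 to d[i] (modulo 2^32, since the
 * accumulator is a uint32_t).
 */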

void HELPER(gvec_udot_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd;
    uint8_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] += n[i * 4 + 0] * m[i * 4 + 0]
              + n[i * 4 + 1] * m[i * 4 + 1]
              + n[i * 4 + 2] * m[i * 4 + 2]
              + n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd;
    int16_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] += (int64_t)n[i * 4 + 0] * m[i * 4 + 0]
              + (int64_t)n[i * 4 + 1] * m[i * 4 + 1]
              + (int64_t)n[i * 4 + 2] * m[i * 4 + 2]
              + (int64_t)n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd;
    uint16_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] += (uint64_t)n[i * 4 + 0] * m[i * 4 + 0]
              + (uint64_t)n[i * 4 + 1] * m[i * 4 + 1]
              + (uint64_t)n[i * 4 + 2] * m[i * 4 + 2]
              + (uint64_t)n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
    intptr_t index = simd_data(desc);
    uint32_t *d = vd;
    int8_t *n = vn;
    int8_t *m_indexed = (int8_t *)vm + index * 4;

    /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
     * Otherwise opr_sz is a multiple of 16.
     */
    segend = MIN(4, opr_sz_4);
    i = 0;
    do {
        int8_t m0 = m_indexed[i * 4 + 0];
        int8_t m1 = m_indexed[i * 4 + 1];
        int8_t m2 = m_indexed[i * 4 + 2];
        int8_t m3 = m_indexed[i * 4 + 3];

        do {
            d[i] += n[i * 4 + 0] * m0
                  + n[i * 4 + 1] * m1
                  + n[i * 4 + 2] * m2
                  + n[i * 4 + 3] * m3;
        } while (++i < segend);
        segend = i + 4;
    } while (i < opr_sz_4);

    clear_tail(d, opr_sz, simd_maxsz(desc));
}
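
/*
 * Illustrative note (not in the original source): the outer loop above
 * walks one 128-bit segment (four 32-bit lanes) at a time, reloading
 * m0..m3 because SVE applies the index within each segment.  With
 * opr_sz == 32, for instance, the inner loop runs for i = 0..3 using the
 * four bytes at m_indexed[0..3], then for i = 4..7 using the bytes at
 * m_indexed[16..19].  The MIN(4, opr_sz_4) covers the 64-bit AdvSIMD
 * case, where opr_sz == 8 leaves only two lanes to process.
 */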

void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
    intptr_t index = simd_data(desc);
    uint32_t *d = vd;
    uint8_t *n = vn;
    uint8_t *m_indexed = (uint8_t *)vm + index * 4;

    /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
     * Otherwise opr_sz is a multiple of 16.
     */
    segend = MIN(4, opr_sz_4);
    i = 0;
    do {
        uint8_t m0 = m_indexed[i * 4 + 0];
        uint8_t m1 = m_indexed[i * 4 + 1];
        uint8_t m2 = m_indexed[i * 4 + 2];
        uint8_t m3 = m_indexed[i * 4 + 3];

        do {
            d[i] += n[i * 4 + 0] * m0
                  + n[i * 4 + 1] * m1
                  + n[i * 4 + 2] * m2
                  + n[i * 4 + 3] * m3;
        } while (++i < segend);
        segend = i + 4;
    } while (i < opr_sz_4);

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
    intptr_t index = simd_data(desc);
    uint64_t *d = vd;
    int16_t *n = vn;
    int16_t *m_indexed = (int16_t *)vm + index * 4;

    /* This is supported by SVE only, so opr_sz is always a multiple of 16.
     * Process the entire segment all at once, writing back the results
     * only after we've consumed all of the inputs.
     */
    for (i = 0; i < opr_sz_8 ; i += 2) {
        uint64_t d0, d1;

        d0  = n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0];
        d0 += n[i * 4 + 1] * (int64_t)m_indexed[i * 4 + 1];
        d0 += n[i * 4 + 2] * (int64_t)m_indexed[i * 4 + 2];
        d0 += n[i * 4 + 3] * (int64_t)m_indexed[i * 4 + 3];
        d1  = n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0];
        d1 += n[i * 4 + 5] * (int64_t)m_indexed[i * 4 + 1];
        d1 += n[i * 4 + 6] * (int64_t)m_indexed[i * 4 + 2];
        d1 += n[i * 4 + 7] * (int64_t)m_indexed[i * 4 + 3];

        d[i + 0] += d0;
        d[i + 1] += d1;
    }

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
    intptr_t index = simd_data(desc);
    uint64_t *d = vd;
    uint16_t *n = vn;
    uint16_t *m_indexed = (uint16_t *)vm + index * 4;

    /* This is supported by SVE only, so opr_sz is always a multiple of 16.
     * Process the entire segment all at once, writing back the results
     * only after we've consumed all of the inputs.
     */
    for (i = 0; i < opr_sz_8 ; i += 2) {
        uint64_t d0, d1;

        d0  = n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
        d0 += n[i * 4 + 1] * (uint64_t)m_indexed[i * 4 + 1];
        d0 += n[i * 4 + 2] * (uint64_t)m_indexed[i * 4 + 2];
        d0 += n[i * 4 + 3] * (uint64_t)m_indexed[i * 4 + 3];
        d1  = n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
        d1 += n[i * 4 + 5] * (uint64_t)m_indexed[i * 4 + 1];
        d1 += n[i * 4 + 6] * (uint64_t)m_indexed[i * 4 + 2];
        d1 += n[i * 4 + 7] * (uint64_t)m_indexed[i * 4 + 3];

        d[i + 0] += d0;
        d[i + 1] += d1;
    }

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
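
/*
 * Illustrative note (not in the original source): XORing a float16 with
 * 1 << 15 flips its IEEE sign bit, so the two masks above negate one
 * operand of each addition without a softfloat call.  With the desc bit
 * clear (neg_imag set), each complex pair computes
 *   d.real = n.real - m.imag,  d.imag = n.imag + m.real
 * i.e. FCADD with a 90-degree rotation; with the bit set the signs swap
 * and the pair computes the 270-degree rotation instead.
 */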

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
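
/*
 * Illustrative note (not in the original source): each FCMLA helper
 * performs one half of a complex multiply-accumulate, selected by the
 * rotation encoded in desc.  With flip = 0 and neg_imag = 0 (rotate 0)
 * the loop above computes
 *   d.real += n.real * m.real,  d.imag += n.real * m.imag
 * while flip = 1 with neg_real = 1 (rotate 90) gives
 *   d.real += -n.imag * m.imag, d.imag += n.imag * m.real
 * so issuing the rotate-0 and rotate-90 forms back to back accumulates a
 * full complex product.  The 180 and 270 rotations are the same two
 * halves with the sign masks inverted.
 */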

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = 16 / sizeof(float16);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, d[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, d[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = 16 / sizeof(float32);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, d[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, d[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, d[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, d[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

#undef DO_2OP

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_s, float32_abd, float32)

#ifdef TARGET_AARCH64

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */

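/*
 * Illustrative note (not in the original source): in the DO_MUL_IDX
 * expansion below, "segment" is the number of elements in 128 bits, so
 * for uint32_t and a 32-byte SVE operand the outer loop re-fetches
 * m[i + idx] at i = 0 and i = 4, i.e. element idx of each 128-bit
 * segment.  For a 16-byte AdvSIMD operand the outer loop runs once and
 * the single fetch covers the whole vector.
 */
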
#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = n[i + j] * mm;                                      \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, )

#undef DO_MUL_IDX

#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = a[i + j] OP n[i + j] * mm;                          \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +,   )

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -,   )

#undef DO_MLA_IDX

#define DO_FMUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_mul(n[i + j], mm, stat);                     \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_FMUL_IDX(gvec_fmul_idx_h, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, float64, )

#undef DO_FMUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
                  void *stat, uint32_t desc)                               \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
                                     mm, a[i + j], 0, stat);               \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, )

#undef DO_FMLA_IDX

#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
    bool q = false;                                                        \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
        WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
        if (dd < MIN) {                                                    \
            dd = MIN;                                                      \
            q = true;                                                      \
        } else if (dd > MAX) {                                             \
            dd = MAX;                                                      \
            q = true;                                                      \
        }                                                                  \
        d[i] = dd;                                                         \
    }                                                                      \
    if (q) {                                                               \
        uint32_t *qc = vq;                                                 \
        qc[0] = 1;                                                         \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)

DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)

DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)

#undef DO_SAT

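/*
 * Illustrative note (not in the original source): WTYPE is a type wide
 * enough that the OP result cannot wrap, which makes the clamp trivial.
 * For example gvec_uqadd_b evaluates 200 + 100 as the plain int 300,
 * clamps it to UINT8_MAX and sets QC; gvec_uqsub_b evaluates 3 - 5 as
 * the int -2 and clamps it to 0.  The 64-bit helpers below have no wider
 * type to use, so they detect overflow from the operands instead.
 */
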
void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (dd < nn) {
            dd = UINT64_MAX;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (nn < mm) {
            dd = 0;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
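
/*
 * Illustrative note (not in the original source): signed addition
 * overflows exactly when both operands have the same sign and the result
 * has the opposite sign, which is what
 * ((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN tests.  The saturated value
 * (nn >> 63) ^ ~INT64_MIN is INT64_MAX when nn is non-negative (the
 * arithmetic shift yields 0) and INT64_MIN when nn is negative (the
 * shift yields all ones).  The subtraction helper below uses the same
 * trick with the operand-sign test inverted.
 */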

void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}


#define DO_SRA(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] += n[i] >> shift;                          \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SRA(gvec_ssra_b, int8_t)
DO_SRA(gvec_ssra_h, int16_t)
DO_SRA(gvec_ssra_s, int32_t)
DO_SRA(gvec_ssra_d, int64_t)

DO_SRA(gvec_usra_b, uint8_t)
DO_SRA(gvec_usra_h, uint16_t)
DO_SRA(gvec_usra_s, uint32_t)
DO_SRA(gvec_usra_d, uint64_t)

#undef DO_SRA

#define DO_RSHR(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] = (tmp >> 1) + (tmp & 1);                  \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSHR(gvec_srshr_b, int8_t)
DO_RSHR(gvec_srshr_h, int16_t)
DO_RSHR(gvec_srshr_s, int32_t)
DO_RSHR(gvec_srshr_d, int64_t)

DO_RSHR(gvec_urshr_b, uint8_t)
DO_RSHR(gvec_urshr_h, uint16_t)
DO_RSHR(gvec_urshr_s, uint32_t)
DO_RSHR(gvec_urshr_d, uint64_t)

#undef DO_RSHR

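/*
 * Illustrative note (not in the original source): the two-step shift in
 * DO_RSHR computes (n + (1 << (shift - 1))) >> shift without needing a
 * wider type and without ever shifting by the full element width.  For
 * example gvec_urshr_b with shift = 2 and n[i] = 7 gives tmp = 3, so
 * d[i] = 1 + 1 = 2, i.e. 7/4 rounded to nearest with the half-way case
 * going up, matching the architectural rounding shift.
 */
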
#define DO_RSRA(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] += (tmp >> 1) + (tmp & 1);                 \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSRA(gvec_srsra_b, int8_t)
DO_RSRA(gvec_srsra_h, int16_t)
DO_RSRA(gvec_srsra_s, int32_t)
DO_RSRA(gvec_srsra_d, int64_t)

DO_RSRA(gvec_ursra_b, uint8_t)
DO_RSRA(gvec_ursra_h, uint16_t)
DO_RSRA(gvec_ursra_s, uint32_t)
DO_RSRA(gvec_ursra_d, uint64_t)

#undef DO_RSRA

#define DO_SRI(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SRI(gvec_sri_b, uint8_t)
DO_SRI(gvec_sri_h, uint16_t)
DO_SRI(gvec_sri_s, uint32_t)
DO_SRI(gvec_sri_d, uint64_t)

#undef DO_SRI

#define DO_SLI(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SLI(gvec_sli_b, uint8_t)
DO_SLI(gvec_sli_h, uint16_t)
DO_SLI(gvec_sli_s, uint32_t)
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI

/*
 * Convert float16 to float32, raising no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal.  */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32.  Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias.  */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}
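
/*
 * Illustrative note (not in the original source): for a normal input such
 * as float16 1.0 (0x3c00), only the exponent field changes (15 -> 127)
 * and the fraction is shifted up, giving float32 0x3f800000.  For the
 * smallest float16 denormal 0x0001 (2^-24) with fz16 clear, clz32(1) = 31
 * gives shift = 10, the fraction normalizes to 0 once the implicit bit is
 * dropped, and exp becomes 127 - 15 - 10 + 1 = 103, which is again
 * exactly 2^-24 as a normal float32.
 */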

static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}
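
/*
 * Illustrative note (not in the original source): spelled out as a truth
 * table, (is_q, is_2) selects (0,0) -> low half of ptr[0],
 * (0,1) -> ptr[0] >> 32, (1,0) -> all of ptr[0], (1,1) -> all of ptr[1].
 * The index is_q & is_2 picks the qword and (is_2 & ~is_q) << 5 supplies
 * the 32-bit shift, so no branch is needed.
 */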

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues.  */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once.  */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
                         uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    uint64_t n_4;
    float32 m_1;

    /* Pre-load all of the f16 data, avoiding overlap issues.  */
    n_4 = load4_f16(vn, is_q, is_2);

    /* Negate all inputs for FMLSL at once.  */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        int8_t nn = n[i];
        int8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -8 ? -mm : 7);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        int16_t nn = n[i];
        int16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -16 ? -mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t nn = n[i];
        uint64_t mm = m[i];
        uint64_t rr = 0;

        for (j = 0; j < 8; ++j) {
            uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
            rr ^= mm & mask;
            mm = (mm << 1) & 0xfefefefefefefefeull;
            nn >>= 1;
        }
        d[i] = rr;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
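
/*
 * Illustrative note (not in the original source): in the loop above,
 * (nn & 0x0101...) * 0xff expands bit 0 of every byte into a whole-byte
 * mask, and the & 0xfefe... after each shift keeps one byte's bits from
 * spilling into its neighbour, so all eight byte lanes are multiplied in
 * parallel.  Within one lane, n = 0x05 (x^2 + 1) times m = 0x03 (x + 1)
 * accumulates rr = 0x03 on bit 0 and rr ^= 0x0c on bit 2, giving 0x0f,
 * i.e. x^3 + x^2 + x + 1 with XOR in place of carries.
 */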

/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
 */
void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        uint64_t nn = n[i + hi];
        uint64_t mm = m[i + hi];
        uint64_t rhi = 0;
        uint64_t rlo = 0;

        /* Bit 0 can only influence the low 64-bit result.  */
        if (nn & 1) {
            rlo = mm;
        }

        for (j = 1; j < 64; ++j) {
            uint64_t mask = -((nn >> j) & 1);
            rlo ^= (mm << j) & mask;
            rhi ^= (mm >> (64 - j)) & mask;
        }
        d[i] = rlo;
        d[i + 1] = rhi;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->16 polynomial multiply.
 *
 * The byte inputs are expanded to (or extracted from) half-words.
 * Note that neon and sve2 get the inputs from different positions.
 * This allows 4 bytes to be processed in parallel with uint64_t.
 */

static uint64_t expand_byte_to_half(uint64_t x)
{
    return  (x & 0x000000ff)
         | ((x & 0x0000ff00) << 8)
         | ((x & 0x00ff0000) << 16)
         | ((x & 0xff000000) << 24);
}

static uint64_t pmull_h(uint64_t op1, uint64_t op2)
{
    uint64_t result = 0;
    int i;

    for (i = 0; i < 8; ++i) {
        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
        result ^= op2 & mask;
        op1 >>= 1;
        op2 <<= 1;
    }
    return result;
}

void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t nn = n[hi], mm = m[hi];

    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
    nn >>= 32;
    mm >>= 32;
    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));

    clear_tail(d, 16, simd_maxsz(desc));
}

#ifdef TARGET_AARCH64
void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int shift = simd_data(desc) * 8;
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;

        d[i] = pmull_h(nn, mm);
    }
}
#endif

#define DO_CMP0(NAME, TYPE, OP)                         \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
        TYPE nn = *(TYPE *)(vn + i);                    \
        *(TYPE *)(vd + i) = -(nn OP 0);                 \
    }                                                   \
    clear_tail(vd, opr_sz, simd_maxsz(desc));           \
}

DO_CMP0(gvec_ceq0_b, int8_t, ==)
DO_CMP0(gvec_clt0_b, int8_t, <)
DO_CMP0(gvec_cle0_b, int8_t, <=)
DO_CMP0(gvec_cgt0_b, int8_t, >)
DO_CMP0(gvec_cge0_b, int8_t, >=)

DO_CMP0(gvec_ceq0_h, int16_t, ==)
DO_CMP0(gvec_clt0_h, int16_t, <)
DO_CMP0(gvec_cle0_h, int16_t, <=)
DO_CMP0(gvec_cgt0_h, int16_t, >)
DO_CMP0(gvec_cge0_h, int16_t, >=)

#undef DO_CMP0

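/*
 * Illustrative note (not in the original source): the comparison in
 * DO_CMP0 yields the C values 0 or 1, and negating them produces the
 * all-zeros or all-ones element that the AdvSIMD compare instructions
 * define.  For example gvec_cgt0_b writes 0xff for an input byte of 5
 * and 0x00 for an input byte of -3.
 */
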
#define DO_ABD(NAME, TYPE)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

DO_ABD(gvec_sabd_b, int8_t)
DO_ABD(gvec_sabd_h, int16_t)
DO_ABD(gvec_sabd_s, int32_t)
DO_ABD(gvec_sabd_d, int64_t)

DO_ABD(gvec_uabd_b, uint8_t)
DO_ABD(gvec_uabd_h, uint16_t)
DO_ABD(gvec_uabd_s, uint32_t)
DO_ABD(gvec_uabd_d, uint64_t)

#undef DO_ABD

#define DO_ABA(NAME, TYPE)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

DO_ABA(gvec_saba_b, int8_t)
DO_ABA(gvec_saba_h, int16_t)
DO_ABA(gvec_saba_s, int32_t)
DO_ABA(gvec_saba_d, int64_t)

DO_ABA(gvec_uaba_b, uint8_t)
DO_ABA(gvec_uaba_h, uint16_t)
DO_ABA(gvec_uaba_s, uint32_t)
DO_ABA(gvec_uaba_d, uint64_t)

#undef DO_ABA
1479