xref: /qemu/target/arm/tcg/neon_helper.c (revision b586c86a8ea1bbf2bc7070153b38b8c75ff8b979)
1 /*
2  * ARM NEON vector operations.
3  *
4  * Copyright (c) 2007, 2008 CodeSourcery.
5  * Written by Paul Brook
6  *
7  * This code is licensed under the GNU GPL v2.
8  */
9 
10 #include "qemu/osdep.h"
11 #include "cpu.h"
12 #include "tcg/tcg-gvec-desc.h"
13 #include "fpu/softfloat.h"
14 #include "vec_internal.h"
15 
16 #define HELPER_H "tcg/helper.h"
17 #include "exec/helper-proto.h.inc"
18 
/* Sign-bit masks used by the 32-bit and 64-bit saturation checks below. */
19 #define SIGNBIT (uint32_t)0x80000000
20 #define SIGNBIT64 ((uint64_t)1 << 63)
21 
/* Record saturation in the sticky QC flag; expansions require a
 * CPUARMState *env to be in scope.
 */
22 #define SET_QC() env->vfp.qc[0] = 1
23 
/*
 * Struct types giving lane-wise access to a uint32_t: NEON_TYPE1/2/4
 * declare a struct of 1/2/4 lanes named neon_<name>.  On big-endian
 * hosts the fields are declared in reverse order so that v1 is always
 * the least significant lane of the packed uint32_t.
 */
24 #define NEON_TYPE1(name, type) \
25 typedef struct \
26 { \
27     type v1; \
28 } neon_##name;
29 #if HOST_BIG_ENDIAN
30 #define NEON_TYPE2(name, type) \
31 typedef struct \
32 { \
33     type v2; \
34     type v1; \
35 } neon_##name;
36 #define NEON_TYPE4(name, type) \
37 typedef struct \
38 { \
39     type v4; \
40     type v3; \
41     type v2; \
42     type v1; \
43 } neon_##name;
44 #else
45 #define NEON_TYPE2(name, type) \
46 typedef struct \
47 { \
48     type v1; \
49     type v2; \
50 } neon_##name;
51 #define NEON_TYPE4(name, type) \
52 typedef struct \
53 { \
54     type v1; \
55     type v2; \
56     type v3; \
57     type v4; \
58 } neon_##name;
59 #endif
60 
/* Instantiate the lane types actually used by the helpers below. */
NEON_TYPE4(s8,int8_t)61 NEON_TYPE4(s8, int8_t)
62 NEON_TYPE4(u8, uint8_t)
63 NEON_TYPE2(s16, int16_t)
64 NEON_TYPE2(u16, uint16_t)
65 NEON_TYPE1(s32, int32_t)
66 NEON_TYPE1(u32, uint32_t)
67 #undef NEON_TYPE4
68 #undef NEON_TYPE2
69 #undef NEON_TYPE1
70 
71 /* Copy from a uint32_t to a vector structure type.  */
72 #define NEON_UNPACK(vtype, dest, val) do { \
73     union { \
74         vtype v; \
75         uint32_t i; \
76     } conv_u; \
77     conv_u.i = (val); \
78     dest = conv_u.v; \
79     } while(0)
80 
81 /* Copy from a vector structure type to a uint32_t.  */
82 #define NEON_PACK(vtype, dest, val) do { \
83     union { \
84         vtype v; \
85         uint32_t i; \
86     } conv_u; \
87     conv_u.v = (val); \
88     dest = conv_u.i; \
89     } while(0)
90 
/* NEON_DOn applies the (locally #defined) NEON_FN to each of n lanes. */
91 #define NEON_DO1 \
92     NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
93 #define NEON_DO2 \
94     NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
95     NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
96 #define NEON_DO4 \
97     NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
98     NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
99     NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
100     NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
101 
/* Shared body for binary lane ops: unpack both 32-bit args into lane
 * structs, apply NEON_FN lane by lane, repack the result.
 */
102 #define NEON_VOP_BODY(vtype, n) \
103 { \
104     uint32_t res; \
105     vtype vsrc1; \
106     vtype vsrc2; \
107     vtype vdest; \
108     NEON_UNPACK(vtype, vsrc1, arg1); \
109     NEON_UNPACK(vtype, vsrc2, arg2); \
110     NEON_DO##n; \
111     NEON_PACK(vtype, res, vdest); \
112     return res; \
113 }
114 
/* Define a 32-bit lane-wise helper with no CPU state. */
115 #define NEON_VOP(name, vtype, n) \
116 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
117 NEON_VOP_BODY(vtype, n)
118 
/* As NEON_VOP, but the helper also takes env (needed when NEON_FN
 * sets the QC flag).
 */
119 #define NEON_VOP_ENV(name, vtype, n) \
120 uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
121 NEON_VOP_BODY(vtype, n)
122 
/* Define a full-vector (gvec) binary helper: apply NEON_FN to every
 * element of the operand vectors and zero any tail bytes.
 */
123 #define NEON_GVEC_VOP2(name, vtype) \
124 void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
125 {                                                               \
126     intptr_t i, opr_sz = simd_oprsz(desc);                      \
127     vtype *d = vd, *n = vn, *m = vm;                            \
128     for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
129         NEON_FN(d[i], n[i], m[i]);                              \
130     }                                                           \
131     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
132 }
133 
/* As NEON_GVEC_VOP2, with env available to NEON_FN. */
134 #define NEON_GVEC_VOP2_ENV(name, vtype) \
135 void HELPER(name)(void *vd, void *vn, void *vm, CPUARMState *env, uint32_t desc) \
136 {                                                               \
137     intptr_t i, opr_sz = simd_oprsz(desc);                      \
138     vtype *d = vd, *n = vn, *m = vm;                            \
139     for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
140         NEON_FN(d[i], n[i], m[i]);                              \
141     }                                                           \
142     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
143 }
144 
/* Unary gvec helper whose second NEON_FN operand is an immediate taken
 * from the simd descriptor data field.
 */
145 #define NEON_GVEC_VOP2i_ENV(name, vtype) \
146 void HELPER(name)(void *vd, void *vn, CPUARMState *env, uint32_t desc) \
147 {                                                               \
148     intptr_t i, opr_sz = simd_oprsz(desc);                      \
149     int imm = simd_data(desc);                                  \
150     vtype *d = vd, *n = vn;                                     \
151     for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
152         NEON_FN(d[i], n[i], imm);                               \
153     }                                                           \
154     clear_tail(d, opr_sz, simd_maxsz(desc));                    \
155 }
156 
157 /* Pairwise operations.  */
158 /* For 32-bit elements each segment only contains a single element, so
159    the elementwise and pairwise operations are the same.  */
/* NEON_PDOn combines adjacent lanes: low results from vsrc1, high
 * results from vsrc2.
 */
160 #define NEON_PDO2 \
161     NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
162     NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
163 #define NEON_PDO4 \
164     NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
165     NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
166     NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
167     NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \
168 
/* Define a pairwise helper: like NEON_VOP but using NEON_PDOn. */
169 #define NEON_POP(name, vtype, n) \
170 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
171 { \
172     uint32_t res; \
173     vtype vsrc1; \
174     vtype vsrc2; \
175     vtype vdest; \
176     NEON_UNPACK(vtype, vsrc1, arg1); \
177     NEON_UNPACK(vtype, vsrc2, arg2); \
178     NEON_PDO##n; \
179     NEON_PACK(vtype, res, vdest); \
180     return res; \
181 }
182 
183 /* Unary operators.  */
/* NEON_FN's second operand is unused ("dummy") for unary ops. */
184 #define NEON_VOP1(name, vtype, n) \
185 uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
186 { \
187     vtype vsrc1; \
188     vtype vdest; \
189     NEON_UNPACK(vtype, vsrc1, arg); \
190     NEON_DO##n; \
191     NEON_PACK(vtype, arg, vdest); \
192     return arg; \
193 }
194 
/* Pairwise minimum / maximum. */
195 #define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
196 NEON_POP(pmin_s8, neon_s8, 4)
197 NEON_POP(pmin_u8, neon_u8, 4)
198 NEON_POP(pmin_s16, neon_s16, 2)
199 NEON_POP(pmin_u16, neon_u16, 2)
200 #undef NEON_FN
201 
202 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
203 NEON_POP(pmax_s8, neon_s8, 4)
204 NEON_POP(pmax_u8, neon_u8, 4)
205 NEON_POP(pmax_s16, neon_s16, 2)
206 NEON_POP(pmax_u16, neon_u16, 2)
207 #undef NEON_FN
208 
/* Variable shifts (VSHL reg form): the shift count is the low signed
 * byte of src2; the do_*qrshl_* helpers are called with round=false
 * and a NULL qc pointer, i.e. no rounding, no saturation.
 */
209 #define NEON_FN(dest, src1, src2) \
210     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
211 NEON_VOP(shl_u16, neon_u16, 2)
212 #undef NEON_FN
213 
214 #define NEON_FN(dest, src1, src2) \
215     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
216 NEON_VOP(shl_s16, neon_s16, 2)
217 #undef NEON_FN
218 
/* Signed rounding shifts (VRSHL/SRSHL): round=true, no saturation. */
219 #define NEON_FN(dest, src1, src2) \
220     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
221 NEON_VOP(rshl_s8, neon_s8, 4)
222 NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
223 #undef NEON_FN
224 
225 #define NEON_FN(dest, src1, src2) \
226     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
227 NEON_VOP(rshl_s16, neon_s16, 2)
228 NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
229 #undef NEON_FN
230 
231 #define NEON_FN(dest, src1, src2) \
232     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
233 NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
234 #undef NEON_FN
235 
236 #define NEON_FN(dest, src1, src2) \
237     (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
238 NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
239 #undef NEON_FN
240 
/* Scalar 32/64-bit signed rounding shift. */
241 uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
242 {
243     return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
244 }
245 
HELPER(neon_rshl_s64)246 uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
247 {
248     return do_sqrshl_d(val, (int8_t)shift, true, NULL);
249 }
250 
/* Unsigned rounding shifts (VRSHL/URSHL): round=true, no saturation. */
251 #define NEON_FN(dest, src1, src2) \
252     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
253 NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b,uint8_t)254 NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
255 #undef NEON_FN
256 
257 #define NEON_FN(dest, src1, src2) \
258     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
259 NEON_VOP(rshl_u16, neon_u16, 2)
260 NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
261 #undef NEON_FN
262 
263 #define NEON_FN(dest, src1, src2) \
264     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
265 NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
266 #undef NEON_FN
267 
268 #define NEON_FN(dest, src1, src2) \
269     (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
270 NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
271 #undef NEON_FN
272 
273 uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
274 {
275     return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
276 }
277 
HELPER(neon_rshl_u64)278 uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
279 {
280     return do_uqrshl_d(val, (int8_t)shift, true, NULL);
281 }
282 
/* Unsigned saturating shifts (VQSHL/UQSHL): round=false; saturation is
 * recorded through env->vfp.qc.  The *i variants take an immediate
 * shift count from the descriptor.
 */
283 #define NEON_FN(dest, src1, src2) \
284     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
285 NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b,uint8_t)286 NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
287 NEON_GVEC_VOP2i_ENV(neon_uqshli_b, uint8_t)
288 #undef NEON_FN
289 
290 #define NEON_FN(dest, src1, src2) \
291     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
292 NEON_VOP_ENV(qshl_u16, neon_u16, 2)
293 NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
294 NEON_GVEC_VOP2i_ENV(neon_uqshli_h, uint16_t)
295 #undef NEON_FN
296 
297 #define NEON_FN(dest, src1, src2) \
298     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
299 NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
300 NEON_GVEC_VOP2i_ENV(neon_uqshli_s, uint32_t)
301 #undef NEON_FN
302 
303 #define NEON_FN(dest, src1, src2) \
304     (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
305 NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
306 NEON_GVEC_VOP2i_ENV(neon_uqshli_d, uint64_t)
307 #undef NEON_FN
308 
309 uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
310 {
311     return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
312 }
313 
HELPER(neon_qshl_u64)314 uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
315 {
316     return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
317 }
318 
/* Signed saturating shifts (VQSHL/SQSHL): round=false, QC on saturate. */
319 #define NEON_FN(dest, src1, src2) \
320     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
321 NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b,int8_t)322 NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
323 NEON_GVEC_VOP2i_ENV(neon_sqshli_b, int8_t)
324 #undef NEON_FN
325 
326 #define NEON_FN(dest, src1, src2) \
327     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
328 NEON_VOP_ENV(qshl_s16, neon_s16, 2)
329 NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
330 NEON_GVEC_VOP2i_ENV(neon_sqshli_h, int16_t)
331 #undef NEON_FN
332 
333 #define NEON_FN(dest, src1, src2) \
334     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
335 NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
336 NEON_GVEC_VOP2i_ENV(neon_sqshli_s, int32_t)
337 #undef NEON_FN
338 
339 #define NEON_FN(dest, src1, src2) \
340     (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
341 NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
342 NEON_GVEC_VOP2i_ENV(neon_sqshli_d, int64_t)
343 #undef NEON_FN
344 
345 uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
346 {
347     return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
348 }
349 
HELPER(neon_qshl_s64)350 uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
351 {
352     return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
353 }
354 
/* Signed-input, unsigned-result saturating shifts (VQSHLU/SQSHLU),
 * via do_suqrshl_*: round=false, QC on saturate.
 */
355 #define NEON_FN(dest, src1, src2) \
356     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
357 NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_b,int8_t)358 NEON_GVEC_VOP2i_ENV(neon_sqshlui_b, int8_t)
359 #undef NEON_FN
360 
361 #define NEON_FN(dest, src1, src2) \
362     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
363 NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
364 NEON_GVEC_VOP2i_ENV(neon_sqshlui_h, int16_t)
365 #undef NEON_FN
366 
367 uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
368 {
369     return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
370 }
371 
HELPER(neon_qshlu_s64)372 uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
373 {
374     return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
375 }
376 
377 #define NEON_FN(dest, src1, src2) \
378     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_s,int32_t)379 NEON_GVEC_VOP2i_ENV(neon_sqshlui_s, int32_t)
380 #undef NEON_FN
381 
382 #define NEON_FN(dest, src1, src2) \
383     (dest = do_suqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
384 NEON_GVEC_VOP2i_ENV(neon_sqshlui_d, int64_t)
385 #undef NEON_FN
386 
/* Unsigned rounding+saturating shifts (VQRSHL/UQRSHL): round=true,
 * QC on saturate.
 */
387 #define NEON_FN(dest, src1, src2) \
388     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
389 NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
390 NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
391 #undef NEON_FN
392 
393 #define NEON_FN(dest, src1, src2) \
394     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
395 NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
396 NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
397 #undef NEON_FN
398 
399 #define NEON_FN(dest, src1, src2) \
400     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
401 NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
402 #undef NEON_FN
403 
404 #define NEON_FN(dest, src1, src2) \
405     (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
406 NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
407 #undef NEON_FN
408 
409 uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
410 {
411     return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
412 }
413 
HELPER(neon_qrshl_u64)414 uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
415 {
416     return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
417 }
418 
/* Signed rounding+saturating shifts (VQRSHL/SQRSHL). */
419 #define NEON_FN(dest, src1, src2) \
420     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
421 NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b,int8_t)422 NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
423 #undef NEON_FN
424 
425 #define NEON_FN(dest, src1, src2) \
426     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
427 NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
428 NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
429 #undef NEON_FN
430 
431 #define NEON_FN(dest, src1, src2) \
432     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
433 NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
434 #undef NEON_FN
435 
436 #define NEON_FN(dest, src1, src2) \
437     (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
438 NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
439 #undef NEON_FN
440 
441 uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
442 {
443     return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
444 }
445 
HELPER(neon_qrshl_s64)446 uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
447 {
448     return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
449 }
450 
HELPER(neon_add_u8)451 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
452 {
453     uint32_t mask;
454     mask = (a ^ b) & 0x80808080u;
455     a &= ~0x80808080u;
456     b &= ~0x80808080u;
457     return (a + b) ^ mask;
458 }
459 
HELPER(neon_add_u16)460 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
461 {
462     uint32_t mask;
463     mask = (a ^ b) & 0x80008000u;
464     a &= ~0x80008000u;
465     b &= ~0x80008000u;
466     return (a + b) ^ mask;
467 }
468 
/* Lane-wise subtract, multiply, and bit test (VTST: all-ones if any
 * common bit is set, else zero).
 */
469 #define NEON_FN(dest, src1, src2) dest = src1 - src2
470 NEON_VOP(sub_u8, neon_u8, 4)
471 NEON_VOP(sub_u16, neon_u16, 2)
472 #undef NEON_FN
473 
474 #define NEON_FN(dest, src1, src2) dest = src1 * src2
475 NEON_VOP(mul_u8, neon_u8, 4)
476 NEON_VOP(mul_u16, neon_u16, 2)
477 #undef NEON_FN
478 
479 #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
480 NEON_VOP(tst_u8, neon_u8, 4)
481 NEON_VOP(tst_u16, neon_u16, 2)
482 NEON_VOP(tst_u32, neon_u32, 1)
483 #undef NEON_FN
484 
/* Count Leading Sign/Zero Bits.  */

/*
 * do_clz8: count leading zero bits in an 8-bit value.
 * Returns 8 for x == 0, down to 0 when bit 7 is set.
 */
static inline int do_clz8(uint8_t x)
{
    int n;

    for (n = 8; x; n--) {
        x >>= 1;
    }
    return n;
}
493 
/*
 * do_clz16: count leading zero bits in a 16-bit value.
 * Returns 16 for x == 0, down to 0 when bit 15 is set.
 */
static inline int do_clz16(uint16_t x)
{
    int n;

    for (n = 16; x; n--) {
        x >>= 1;
    }
    return n;
}
501 
/* Lane-wise count-leading-zeros. */
502 #define NEON_FN(dest, src, dummy) dest = do_clz8(src)
503 NEON_VOP1(clz_u8, neon_u8, 4)
504 #undef NEON_FN
505 
506 #define NEON_FN(dest, src, dummy) dest = do_clz16(src)
507 NEON_VOP1(clz_u16, neon_u16, 2)
508 #undef NEON_FN
509 
/* Lane-wise count-leading-sign: invert negative lanes, count zeros,
 * then subtract one so the sign bit itself is not counted.
 */
510 #define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
511 NEON_VOP1(cls_s8, neon_s8, 4)
512 #undef NEON_FN
513 
514 #define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
515 NEON_VOP1(cls_s16, neon_s16, 2)
516 #undef NEON_FN
517 
HELPER(neon_cls_s32)518 uint32_t HELPER(neon_cls_s32)(uint32_t x)
519 {
520     int count;
521     if ((int32_t)x < 0)
522         x = ~x;
523     for (count = 32; x; count--)
524         x = x >> 1;
525     return count - 1;
526 }
527 
/* Saturating doubling multiply returning high half (VQDMULH/VQRDMULH).
 * The product is doubled; if doubling overflows, QC is set and the
 * result saturates.  With round != 0 a rounding constant is added
 * before taking the high half, again saturating (and setting QC) on
 * overflow.
 */
528 #define NEON_QDMULH16(dest, src1, src2, round) do { \
529     uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
530     if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
531         SET_QC(); \
532         tmp = (tmp >> 31) ^ ~SIGNBIT; \
533     } else { \
534         tmp <<= 1; \
535     } \
536     if (round) { \
537         int32_t old = tmp; \
538         tmp += 1 << 15; \
539         if ((int32_t)tmp < old) { \
540             SET_QC(); \
541             tmp = SIGNBIT - 1; \
542         } \
543     } \
544     dest = tmp >> 16; \
545     } while(0)
546 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
547 NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
548 #undef NEON_FN
549 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
550 NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
551 #undef NEON_FN
552 #undef NEON_QDMULH16
553 
/* 32-bit variant of the above, computed in 64-bit arithmetic. */
554 #define NEON_QDMULH32(dest, src1, src2, round) do { \
555     uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
556     if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
557         SET_QC(); \
558         tmp = (tmp >> 63) ^ ~SIGNBIT64; \
559     } else { \
560         tmp <<= 1; \
561     } \
562     if (round) { \
563         int64_t old = tmp; \
564         tmp += (int64_t)1 << 31; \
565         if ((int64_t)tmp < old) { \
566             SET_QC(); \
567             tmp = SIGNBIT64 - 1; \
568         } \
569     } \
570     dest = tmp >> 32; \
571     } while(0)
572 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
573 NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
574 #undef NEON_FN
575 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
576 NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
577 #undef NEON_FN
578 #undef NEON_QDMULH32
579 
580 /* Only the low 32-bits of output are significant. */
/* Narrow four 16-bit lanes to four 8-bit lanes (truncating). */
HELPER(neon_narrow_u8)581 uint64_t HELPER(neon_narrow_u8)(uint64_t x)
582 {
583     return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
584            | ((x >> 24) & 0xff000000u);
585 }
586 
587 /* Only the low 32-bits of output are significant. */
/* Narrow two 32-bit lanes to two 16-bit lanes (truncating). */
HELPER(neon_narrow_u16)588 uint64_t HELPER(neon_narrow_u16)(uint64_t x)
589 {
590     return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
591 }
592 
/* Narrow by taking the high half of each lane (VSHRN-style). */
HELPER(neon_narrow_high_u8)593 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
594 {
595     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
596             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
597 }
598 
HELPER(neon_narrow_high_u16)599 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
600 {
601     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
602 }
603 
/* As above but rounding: add half an output LSB to each lane first.
 * The pre-masking keeps the per-lane additions from carrying across
 * lane boundaries.
 */
HELPER(neon_narrow_round_high_u8)604 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
605 {
606     x &= 0xff80ff80ff80ff80ull;
607     x += 0x0080008000800080ull;
608     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
609             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
610 }
611 
HELPER(neon_narrow_round_high_u16)612 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
613 {
614     x &= 0xffff8000ffff8000ull;
615     x += 0x0000800000008000ull;
616     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
617 }
618 
619 /* Only the low 32-bits of output are significant. */
/* Narrow four signed 16-bit lanes to unsigned 8-bit with saturation
 * (VQMOVUN): negative lanes clamp to 0, lanes > 0xff clamp to 0xff;
 * QC is set whenever any lane saturates.
 */
HELPER(neon_unarrow_sat8)620 uint64_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
621 {
622     uint16_t s;
623     uint8_t d;
624     uint32_t res = 0;
625 #define SAT8(n) \
626     s = x >> n; \
627     if (s & 0x8000) { \
628         SET_QC(); \
629     } else { \
630         if (s > 0xff) { \
631             d = 0xff; \
632             SET_QC(); \
633         } else  { \
634             d = s; \
635         } \
636         res |= (uint32_t)d << (n / 2); \
637     }
638 
639     SAT8(0);
640     SAT8(16);
641     SAT8(32);
642     SAT8(48);
643 #undef SAT8
644     return res;
645 }
646 
647 /* Only the low 32-bits of output are significant. */
/* Narrow four unsigned 16-bit lanes to unsigned 8-bit with saturation
 * (VQMOVN unsigned): lanes > 0xff clamp to 0xff and set QC.
 */
HELPER(neon_narrow_sat_u8)648 uint64_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
649 {
650     uint16_t s;
651     uint8_t d;
652     uint32_t res = 0;
653 #define SAT8(n) \
654     s = x >> n; \
655     if (s > 0xff) { \
656         d = 0xff; \
657         SET_QC(); \
658     } else  { \
659         d = s; \
660     } \
661     res |= (uint32_t)d << (n / 2);
662 
663     SAT8(0);
664     SAT8(16);
665     SAT8(32);
666     SAT8(48);
667 #undef SAT8
668     return res;
669 }
670 
671 /* Only the low 32-bits of output are significant. */
/* Narrow four signed 16-bit lanes to signed 8-bit with saturation
 * (VQMOVN signed): out-of-range lanes clamp to 0x7f/0x80 and set QC.
 */
HELPER(neon_narrow_sat_s8)672 uint64_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
673 {
674     int16_t s;
675     uint8_t d;
676     uint32_t res = 0;
677 #define SAT8(n) \
678     s = x >> n; \
679     if (s != (int8_t)s) { \
680         d = (s >> 15) ^ 0x7f; \
681         SET_QC(); \
682     } else  { \
683         d = s; \
684     } \
685     res |= (uint32_t)d << (n / 2);
686 
687     SAT8(0);
688     SAT8(16);
689     SAT8(32);
690     SAT8(48);
691 #undef SAT8
692     return res;
693 }
694 
695 /* Only the low 32-bits of output are significant. */
/* Narrow two signed 32-bit lanes to unsigned 16-bit with saturation:
 * negative lanes clamp to 0, lanes > 0xffff clamp to 0xffff; QC set
 * on any saturation.
 */
HELPER(neon_unarrow_sat16)696 uint64_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
697 {
698     uint32_t high;
699     uint32_t low;
700     low = x;
701     if (low & 0x80000000) {
702         low = 0;
703         SET_QC();
704     } else if (low > 0xffff) {
705         low = 0xffff;
706         SET_QC();
707     }
708     high = x >> 32;
709     if (high & 0x80000000) {
710         high = 0;
711         SET_QC();
712     } else if (high > 0xffff) {
713         high = 0xffff;
714         SET_QC();
715     }
716     return deposit32(low, 16, 16, high);
717 }
718 
719 /* Only the low 32-bits of output are significant. */
/* Narrow two unsigned 32-bit lanes to unsigned 16-bit with saturation. */
HELPER(neon_narrow_sat_u16)720 uint64_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
721 {
722     uint32_t high;
723     uint32_t low;
724     low = x;
725     if (low > 0xffff) {
726         low = 0xffff;
727         SET_QC();
728     }
729     high = x >> 32;
730     if (high > 0xffff) {
731         high = 0xffff;
732         SET_QC();
733     }
734     return deposit32(low, 16, 16, high);
735 }
736 
737 /* Only the low 32-bits of output are significant. */
/* Narrow two signed 32-bit lanes to signed 16-bit with saturation:
 * out-of-range lanes clamp to 0x7fff/0x8000 and set QC.
 */
HELPER(neon_narrow_sat_s16)738 uint64_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
739 {
740     int32_t low;
741     int32_t high;
742     low = x;
743     if (low != (int16_t)low) {
744         low = (low >> 31) ^ 0x7fff;
745         SET_QC();
746     }
747     high = x >> 32;
748     if (high != (int16_t)high) {
749         high = (high >> 31) ^ 0x7fff;
750         SET_QC();
751     }
752     return deposit32(low, 16, 16, high);
753 }
754 
755 /* Only the low 32-bits of output are significant. */
/* Narrow one signed 64-bit lane to unsigned 32-bit with saturation. */
HELPER(neon_unarrow_sat32)756 uint64_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
757 {
758     if (x & 0x8000000000000000ull) {
759         SET_QC();
760         return 0;
761     }
762     if (x > 0xffffffffu) {
763         SET_QC();
764         return 0xffffffffu;
765     }
766     return x;
767 }
768 
769 /* Only the low 32-bits of output are significant. */
/* Narrow one unsigned 64-bit lane to unsigned 32-bit with saturation. */
HELPER(neon_narrow_sat_u32)770 uint64_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
771 {
772     if (x > 0xffffffffu) {
773         SET_QC();
774         return 0xffffffffu;
775     }
776     return x;
777 }
778 
779 /* Only the low 32-bits of output are significant. */
/* Narrow one signed 64-bit lane to signed 32-bit with saturation. */
HELPER(neon_narrow_sat_s32)780 uint64_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
781 {
782     if ((int64_t)x != (int32_t)x) {
783         SET_QC();
784         return (uint32_t)((int64_t)x >> 63) ^ 0x7fffffff;
785     }
786     return (uint32_t)x;
787 }
788 
/* Widen four 8-bit lanes to four 16-bit lanes (zero extend). */
HELPER(neon_widen_u8)789 uint64_t HELPER(neon_widen_u8)(uint32_t x)
790 {
791     uint64_t tmp;
792     uint64_t ret;
793     ret = (uint8_t)x;
794     tmp = (uint8_t)(x >> 8);
795     ret |= tmp << 16;
796     tmp = (uint8_t)(x >> 16);
797     ret |= tmp << 32;
798     tmp = (uint8_t)(x >> 24);
799     ret |= tmp << 48;
800     return ret;
801 }
802 
/* Widen four 8-bit lanes to four 16-bit lanes (sign extend). */
HELPER(neon_widen_s8)803 uint64_t HELPER(neon_widen_s8)(uint32_t x)
804 {
805     uint64_t tmp;
806     uint64_t ret;
807     ret = (uint16_t)(int8_t)x;
808     tmp = (uint16_t)(int8_t)(x >> 8);
809     ret |= tmp << 16;
810     tmp = (uint16_t)(int8_t)(x >> 16);
811     ret |= tmp << 32;
812     tmp = (uint16_t)(int8_t)(x >> 24);
813     ret |= tmp << 48;
814     return ret;
815 }
816 
/* Widen two 16-bit lanes to two 32-bit lanes (zero extend). */
HELPER(neon_widen_u16)817 uint64_t HELPER(neon_widen_u16)(uint32_t x)
818 {
819     uint64_t high = (uint16_t)(x >> 16);
820     return ((uint16_t)x) | (high << 32);
821 }
822 
/* Widen two 16-bit lanes to two 32-bit lanes (sign extend). */
HELPER(neon_widen_s16)823 uint64_t HELPER(neon_widen_s16)(uint32_t x)
824 {
825     uint64_t high = (int16_t)(x >> 16);
826     return ((uint32_t)(int16_t)x) | (high << 32);
827 }
828 
829 /* Pairwise long add: add pairs of adjacent elements into
830  * double-width elements in the result (eg _s8 is an 8x8->16 op)
831  */
HELPER(neon_addlp_s8)832 uint64_t HELPER(neon_addlp_s8)(uint64_t a)
833 {
834     uint64_t nsignmask = 0x0080008000800080ULL;
835     uint64_t wsignmask = 0x8000800080008000ULL;
836     uint64_t elementmask = 0x00ff00ff00ff00ffULL;
837     uint64_t tmp1, tmp2;
838     uint64_t res, signres;
839 
840     /* Extract odd elements, sign extend each to a 16 bit field */
841     tmp1 = a & elementmask;
842     tmp1 ^= nsignmask;
843     tmp1 |= wsignmask;
844     tmp1 = (tmp1 - nsignmask) ^ wsignmask;
845     /* Ditto for the even elements */
846     tmp2 = (a >> 8) & elementmask;
847     tmp2 ^= nsignmask;
848     tmp2 |= wsignmask;
849     tmp2 = (tmp2 - nsignmask) ^ wsignmask;
850 
851     /* calculate the result by summing bits 0..14, 16..22, etc,
852      * and then adjusting the sign bits 15, 23, etc manually.
853      * This ensures the addition can't overflow the 16 bit field.
854      */
855     signres = (tmp1 ^ tmp2) & wsignmask;
856     res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask);
857     res ^= signres;
858 
859     return res;
860 }
861 
/* Pairwise long add of four signed 16-bit lanes into two 32-bit
 * lanes; done in plain scalar arithmetic since each half fits easily.
 */
HELPER(neon_addlp_s16)862 uint64_t HELPER(neon_addlp_s16)(uint64_t a)
863 {
864     int32_t reslo, reshi;
865 
866     reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16);
867     reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48);
868 
869     return (uint32_t)reslo | (((uint64_t)reshi) << 32);
870 }
871 
/* Saturating add of two pairs of signed 32-bit lanes.  A lane
 * overflows when the operands have the same sign but the sum's sign
 * differs; it is then clamped to INT32_MAX/INT32_MIN and QC is set.
 */
HELPER(neon_addl_saturate_s32)872 uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
873 {
874     uint32_t x, y;
875     uint32_t low, high;
876 
877     x = a;
878     y = b;
879     low = x + y;
880     if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
881         SET_QC();
882         low = ((int32_t)x >> 31) ^ ~SIGNBIT;
883     }
884     x = a >> 32;
885     y = b >> 32;
886     high = x + y;
887     if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
888         SET_QC();
889         high = ((int32_t)x >> 31) ^ ~SIGNBIT;
890     }
891     return low | ((uint64_t)high << 32);
892 }
893 
/* Saturating add of two signed 64-bit values; same overflow rule as
 * the 32-bit version, clamping to INT64_MAX/INT64_MIN and setting QC.
 */
HELPER(neon_addl_saturate_s64)894 uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
895 {
896     uint64_t result;
897 
898     result = a + b;
899     if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
900         SET_QC();
901         result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
902     }
903     return result;
904 }
905 
/* Widening absolute difference (VABDL): each result lane is
 * |a_lane - b_lane| stored in a lane of twice the input width.
 */
906 /* We have to do the arithmetic in a larger type than
907  * the input type, because for example with a signed 32 bit
908  * op the absolute difference can overflow a signed 32 bit value.
909  */
910 #define DO_ABD(dest, x, y, intype, arithtype) do {            \
911     arithtype tmp_x = (intype)(x);                            \
912     arithtype tmp_y = (intype)(y);                            \
913     dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
914     } while(0)
915 
HELPER(neon_abdl_u16)916 uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
917 {
918     uint64_t tmp;
919     uint64_t result;
920     DO_ABD(result, a, b, uint8_t, uint32_t);
921     DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
922     result |= tmp << 16;
923     DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
924     result |= tmp << 32;
925     DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
926     result |= tmp << 48;
927     return result;
928 }
929 
HELPER(neon_abdl_s16)930 uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
931 {
932     uint64_t tmp;
933     uint64_t result;
934     DO_ABD(result, a, b, int8_t, int32_t);
935     DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
936     result |= tmp << 16;
937     DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
938     result |= tmp << 32;
939     DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
940     result |= tmp << 48;
941     return result;
942 }
943 
HELPER(neon_abdl_u32)944 uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
945 {
946     uint64_t tmp;
947     uint64_t result;
948     DO_ABD(result, a, b, uint16_t, uint32_t);
949     DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
950     return result | (tmp << 32);
951 }
952 
HELPER(neon_abdl_s32)953 uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
954 {
955     uint64_t tmp;
956     uint64_t result;
957     DO_ABD(result, a, b, int16_t, int32_t);
958     DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
959     return result | (tmp << 32);
960 }
961 
HELPER(neon_abdl_u64)962 uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
963 {
964     uint64_t result;
965     DO_ABD(result, a, b, uint32_t, uint64_t);
966     return result;
967 }
968 
HELPER(neon_abdl_s64)969 uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
970 {
971     uint64_t result;
972     DO_ABD(result, a, b, int32_t, int64_t);
973     return result;
974 }
975 #undef DO_ABD
976 
977 /* Widening multiply. Named type is the source type.  */
978 #define DO_MULL(dest, x, y, type1, type2) do { \
979     type1 tmp_x = x; \
980     type1 tmp_y = y; \
981     dest = (type2)((type2)tmp_x * (type2)tmp_y); \
982     } while(0)
983 
/* Four 8x8->16 unsigned products packed into a 64-bit result. */
HELPER(neon_mull_u8)984 uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
985 {
986     uint64_t tmp;
987     uint64_t result;
988 
989     DO_MULL(result, a, b, uint8_t, uint16_t);
990     DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
991     result |= tmp << 16;
992     DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
993     result |= tmp << 32;
994     DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
995     result |= tmp << 48;
996     return result;
997 }
998 
/* Four 8x8->16 signed products; the result lanes are kept as
 * uint16_t so that packing with | and << is well defined.
 */
HELPER(neon_mull_s8)999 uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
1000 {
1001     uint64_t tmp;
1002     uint64_t result;
1003 
1004     DO_MULL(result, a, b, int8_t, uint16_t);
1005     DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
1006     result |= tmp << 16;
1007     DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
1008     result |= tmp << 32;
1009     DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
1010     result |= tmp << 48;
1011     return result;
1012 }
1013 
/* Two 16x16->32 unsigned products. */
HELPER(neon_mull_u16)1014 uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
1015 {
1016     uint64_t tmp;
1017     uint64_t result;
1018 
1019     DO_MULL(result, a, b, uint16_t, uint32_t);
1020     DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1021     return result | (tmp << 32);
1022 }
1023 
/* Two 16x16->32 signed products, packed as unsigned lanes. */
HELPER(neon_mull_s16)1024 uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
1025 {
1026     uint64_t tmp;
1027     uint64_t result;
1028 
1029     DO_MULL(result, a, b, int16_t, uint32_t);
1030     DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
1031     return result | (tmp << 32);
1032 }
1033 
HELPER(neon_negl_u16)1034 uint64_t HELPER(neon_negl_u16)(uint64_t x)
1035 {
1036     uint16_t tmp;
1037     uint64_t result;
1038     result = (uint16_t)-x;
1039     tmp = -(x >> 16);
1040     result |= (uint64_t)tmp << 16;
1041     tmp = -(x >> 32);
1042     result |= (uint64_t)tmp << 32;
1043     tmp = -(x >> 48);
1044     result |= (uint64_t)tmp << 48;
1045     return result;
1046 }
1047 
HELPER(neon_negl_u32)1048 uint64_t HELPER(neon_negl_u32)(uint64_t x)
1049 {
1050     uint32_t low = -x;
1051     uint32_t high = -(x >> 32);
1052     return low | ((uint64_t)high << 32);
1053 }
1054 
1055 /* Saturating sign manipulation.  */
1056 /* ??? Make these use NEON_VOP1 */
1057 #define DO_QABS8(x) do { \
1058     if (x == (int8_t)0x80) { \
1059         x = 0x7f; \
1060         SET_QC(); \
1061     } else if (x < 0) { \
1062         x = -x; \
1063     }} while (0)
HELPER(neon_qabs_s8)1064 uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
1065 {
1066     neon_s8 vec;
1067     NEON_UNPACK(neon_s8, vec, x);
1068     DO_QABS8(vec.v1);
1069     DO_QABS8(vec.v2);
1070     DO_QABS8(vec.v3);
1071     DO_QABS8(vec.v4);
1072     NEON_PACK(neon_s8, x, vec);
1073     return x;
1074 }
1075 #undef DO_QABS8
1076 
1077 #define DO_QNEG8(x) do { \
1078     if (x == (int8_t)0x80) { \
1079         x = 0x7f; \
1080         SET_QC(); \
1081     } else { \
1082         x = -x; \
1083     }} while (0)
HELPER(neon_qneg_s8)1084 uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
1085 {
1086     neon_s8 vec;
1087     NEON_UNPACK(neon_s8, vec, x);
1088     DO_QNEG8(vec.v1);
1089     DO_QNEG8(vec.v2);
1090     DO_QNEG8(vec.v3);
1091     DO_QNEG8(vec.v4);
1092     NEON_PACK(neon_s8, x, vec);
1093     return x;
1094 }
1095 #undef DO_QNEG8
1096 
1097 #define DO_QABS16(x) do { \
1098     if (x == (int16_t)0x8000) { \
1099         x = 0x7fff; \
1100         SET_QC(); \
1101     } else if (x < 0) { \
1102         x = -x; \
1103     }} while (0)
HELPER(neon_qabs_s16)1104 uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
1105 {
1106     neon_s16 vec;
1107     NEON_UNPACK(neon_s16, vec, x);
1108     DO_QABS16(vec.v1);
1109     DO_QABS16(vec.v2);
1110     NEON_PACK(neon_s16, x, vec);
1111     return x;
1112 }
1113 #undef DO_QABS16
1114 
1115 #define DO_QNEG16(x) do { \
1116     if (x == (int16_t)0x8000) { \
1117         x = 0x7fff; \
1118         SET_QC(); \
1119     } else { \
1120         x = -x; \
1121     }} while (0)
HELPER(neon_qneg_s16)1122 uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
1123 {
1124     neon_s16 vec;
1125     NEON_UNPACK(neon_s16, vec, x);
1126     DO_QNEG16(vec.v1);
1127     DO_QNEG16(vec.v2);
1128     NEON_PACK(neon_s16, x, vec);
1129     return x;
1130 }
1131 #undef DO_QNEG16
1132 
HELPER(neon_qabs_s32)1133 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
1134 {
1135     if (x == SIGNBIT) {
1136         SET_QC();
1137         x = ~SIGNBIT;
1138     } else if ((int32_t)x < 0) {
1139         x = -x;
1140     }
1141     return x;
1142 }
1143 
HELPER(neon_qneg_s32)1144 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
1145 {
1146     if (x == SIGNBIT) {
1147         SET_QC();
1148         x = ~SIGNBIT;
1149     } else {
1150         x = -x;
1151     }
1152     return x;
1153 }
1154 
HELPER(neon_qabs_s64)1155 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
1156 {
1157     if (x == SIGNBIT64) {
1158         SET_QC();
1159         x = ~SIGNBIT64;
1160     } else if ((int64_t)x < 0) {
1161         x = -x;
1162     }
1163     return x;
1164 }
1165 
HELPER(neon_qneg_s64)1166 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
1167 {
1168     if (x == SIGNBIT64) {
1169         SET_QC();
1170         x = ~SIGNBIT64;
1171     } else {
1172         x = -x;
1173     }
1174     return x;
1175 }
1176 
1177 /* NEON Float helpers.  */
1178 
1179 /* Floating point comparisons produce an integer result.
1180  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1181  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1182  */
HELPER(neon_ceq_f32)1183 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, float_status *fpst)
1184 {
1185     return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
1186 }
1187 
HELPER(neon_cge_f32)1188 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, float_status *fpst)
1189 {
1190     return -float32_le(make_float32(b), make_float32(a), fpst);
1191 }
1192 
HELPER(neon_cgt_f32)1193 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, float_status *fpst)
1194 {
1195     return -float32_lt(make_float32(b), make_float32(a), fpst);
1196 }
1197 
HELPER(neon_acge_f32)1198 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, float_status *fpst)
1199 {
1200     float32 f0 = float32_abs(make_float32(a));
1201     float32 f1 = float32_abs(make_float32(b));
1202     return -float32_le(f1, f0, fpst);
1203 }
1204 
HELPER(neon_acgt_f32)1205 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, float_status *fpst)
1206 {
1207     float32 f0 = float32_abs(make_float32(a));
1208     float32 f1 = float32_abs(make_float32(b));
1209     return -float32_lt(f1, f0, fpst);
1210 }
1211 
HELPER(neon_acge_f64)1212 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, float_status *fpst)
1213 {
1214     float64 f0 = float64_abs(make_float64(a));
1215     float64 f1 = float64_abs(make_float64(b));
1216     return -float64_le(f1, f0, fpst);
1217 }
1218 
HELPER(neon_acgt_f64)1219 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, float_status *fpst)
1220 {
1221     float64 f0 = float64_abs(make_float64(a));
1222     float64 f1 = float64_abs(make_float64(b));
1223     return -float64_lt(f1, f0, fpst);
1224 }
1225 
/*
 * Extract element N (counting from the least significant end) of
 * SIZE bits from the vector word V.  SIZE must be < 64 here or the
 * mask computation would shift out of range.
 */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1227 
HELPER(neon_qunzip8)1228 void HELPER(neon_qunzip8)(void *vd, void *vm)
1229 {
1230     uint64_t *rd = vd, *rm = vm;
1231     uint64_t zd0 = rd[0], zd1 = rd[1];
1232     uint64_t zm0 = rm[0], zm1 = rm[1];
1233 
1234     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
1235         | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
1236         | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
1237         | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
1238     uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
1239         | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
1240         | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1241         | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
1242     uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
1243         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
1244         | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
1245         | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
1246     uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
1247         | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
1248         | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
1249         | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1250 
1251     rm[0] = m0;
1252     rm[1] = m1;
1253     rd[0] = d0;
1254     rd[1] = d1;
1255 }
1256 
HELPER(neon_qunzip16)1257 void HELPER(neon_qunzip16)(void *vd, void *vm)
1258 {
1259     uint64_t *rd = vd, *rm = vm;
1260     uint64_t zd0 = rd[0], zd1 = rd[1];
1261     uint64_t zm0 = rm[0], zm1 = rm[1];
1262 
1263     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
1264         | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
1265     uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
1266         | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
1267     uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
1268         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
1269     uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
1270         | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1271 
1272     rm[0] = m0;
1273     rm[1] = m1;
1274     rd[0] = d0;
1275     rd[1] = d1;
1276 }
1277 
HELPER(neon_qunzip32)1278 void HELPER(neon_qunzip32)(void *vd, void *vm)
1279 {
1280     uint64_t *rd = vd, *rm = vm;
1281     uint64_t zd0 = rd[0], zd1 = rd[1];
1282     uint64_t zm0 = rm[0], zm1 = rm[1];
1283 
1284     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
1285     uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1286     uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
1287     uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1288 
1289     rm[0] = m0;
1290     rm[1] = m1;
1291     rd[0] = d0;
1292     rd[1] = d1;
1293 }
1294 
HELPER(neon_unzip8)1295 void HELPER(neon_unzip8)(void *vd, void *vm)
1296 {
1297     uint64_t *rd = vd, *rm = vm;
1298     uint64_t zd = rd[0], zm = rm[0];
1299 
1300     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
1301         | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
1302         | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1303         | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
1304     uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
1305         | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
1306         | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
1307         | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1308 
1309     rm[0] = m0;
1310     rd[0] = d0;
1311 }
1312 
HELPER(neon_unzip16)1313 void HELPER(neon_unzip16)(void *vd, void *vm)
1314 {
1315     uint64_t *rd = vd, *rm = vm;
1316     uint64_t zd = rd[0], zm = rm[0];
1317 
1318     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
1319         | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
1320     uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
1321         | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1322 
1323     rm[0] = m0;
1324     rd[0] = d0;
1325 }
1326 
HELPER(neon_qzip8)1327 void HELPER(neon_qzip8)(void *vd, void *vm)
1328 {
1329     uint64_t *rd = vd, *rm = vm;
1330     uint64_t zd0 = rd[0], zd1 = rd[1];
1331     uint64_t zm0 = rm[0], zm1 = rm[1];
1332 
1333     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
1334         | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
1335         | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
1336         | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
1337     uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
1338         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
1339         | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
1340         | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
1341     uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
1342         | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
1343         | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1344         | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
1345     uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
1346         | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
1347         | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
1348         | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1349 
1350     rm[0] = m0;
1351     rm[1] = m1;
1352     rd[0] = d0;
1353     rd[1] = d1;
1354 }
1355 
HELPER(neon_qzip16)1356 void HELPER(neon_qzip16)(void *vd, void *vm)
1357 {
1358     uint64_t *rd = vd, *rm = vm;
1359     uint64_t zd0 = rd[0], zd1 = rd[1];
1360     uint64_t zm0 = rm[0], zm1 = rm[1];
1361 
1362     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
1363         | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
1364     uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
1365         | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
1366     uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
1367         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
1368     uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
1369         | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1370 
1371     rm[0] = m0;
1372     rm[1] = m1;
1373     rd[0] = d0;
1374     rd[1] = d1;
1375 }
1376 
HELPER(neon_qzip32)1377 void HELPER(neon_qzip32)(void *vd, void *vm)
1378 {
1379     uint64_t *rd = vd, *rm = vm;
1380     uint64_t zd0 = rd[0], zd1 = rd[1];
1381     uint64_t zm0 = rm[0], zm1 = rm[1];
1382 
1383     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
1384     uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
1385     uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1386     uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1387 
1388     rm[0] = m0;
1389     rm[1] = m1;
1390     rd[0] = d0;
1391     rd[1] = d1;
1392 }
1393 
HELPER(neon_zip8)1394 void HELPER(neon_zip8)(void *vd, void *vm)
1395 {
1396     uint64_t *rd = vd, *rm = vm;
1397     uint64_t zd = rd[0], zm = rm[0];
1398 
1399     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
1400         | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
1401         | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1402         | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
1403     uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
1404         | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
1405         | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
1406         | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1407 
1408     rm[0] = m0;
1409     rd[0] = d0;
1410 }
1411 
HELPER(neon_zip16)1412 void HELPER(neon_zip16)(void *vd, void *vm)
1413 {
1414     uint64_t *rd = vd, *rm = vm;
1415     uint64_t zd = rd[0], zm = rm[0];
1416 
1417     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
1418         | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
1419     uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
1420         | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1421 
1422     rm[0] = m0;
1423     rd[0] = d0;
1424 }
1425