xref: /qemu/target/arm/tcg/neon_helper.c (revision df6fe2abf2e990f767ce755d426bc439c7bba336)
1 /*
2  * ARM NEON vector operations.
3  *
4  * Copyright (c) 2007, 2008 CodeSourcery.
5  * Written by Paul Brook
6  *
7  * This code is licensed under the GNU GPL v2.
8  */
9 
10 #include "qemu/osdep.h"
11 #include "cpu.h"
12 #include "tcg/tcg-gvec-desc.h"
13 #include "fpu/softfloat.h"
14 #include "vec_internal.h"
15 
16 #define HELPER_H "tcg/helper.h"
17 #include "exec/helper-proto.h.inc"
18 
/* Sign bit of a 32-bit / 64-bit value, used by the saturation checks below. */
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Record saturation in the cumulative QC (saturation) flag. */
#define SET_QC() env->vfp.qc[0] = 1
23 
/*
 * Define vector "container" structs holding 1, 2 or 4 elements of the
 * given type, so a 32-bit scalar can be reinterpreted as a short vector.
 * v1 is always the least significant element, hence the field order is
 * reversed on big-endian hosts.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

/* Instantiate the container types for each element size and signedness. */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
70 
/* Copy from a uint32_t to a vector structure type.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Apply NEON_FN to each of the 1, 2 or 4 element pairs of vsrc1/vsrc2. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/*
 * Body of a binary element-wise helper: unpack the two 32-bit operands
 * into vectors, apply the current NEON_FN to each element pair, and
 * repack the result into a uint32_t.
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP, but the helper also receives the CPU state (for SET_QC). */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
122 
/*
 * Generic-vector binary op: apply NEON_FN to every element of the
 * operand vectors, then clear any tail bytes beyond the operation size.
 */
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    vtype *d = vd, *n = vn, *m = vm;                            \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

/* As NEON_GVEC_VOP2, with CPU state so NEON_FN can set the QC flag. */
#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, CPUARMState *env, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    vtype *d = vd, *n = vn, *m = vm;                            \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

/* Vector-by-immediate op: the second operand comes from simd_data(desc). */
#define NEON_GVEC_VOP2i_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, CPUARMState *env, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    int imm = simd_data(desc);                                  \
    vtype *d = vd, *n = vn;                                     \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], imm);                               \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}
156 
/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

/*
 * Pairwise helper: the low half of the result combines adjacent elements
 * of arg1, the high half adjacent elements of arg2.
 */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators.  */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
194 
/* Pairwise minimum: each result element is the smaller of an adjacent pair. */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Pairwise maximum. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN
208 
/*
 * Shift helpers.  do_{s,u}qrshl_bhs / do_{s,u}qrshl_d (vec_internal.h)
 * implement the whole shift-by-signed-register family; the bool selects
 * rounding and the final pointer selects saturation (NULL = none).
 * The Neon forms only use the low 8 bits of the shift operand, hence
 * the (int8_t) casts; the SME2 variants below take the full value.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

/* Signed rounding shift left. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, src2, 16, true, NULL))
NEON_GVEC_VOP2(sme2_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, src2, 32, true, NULL))
NEON_GVEC_VOP2(sme2_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, src2, true, NULL))
NEON_GVEC_VOP2(sme2_srshl_d, int64_t)
#undef NEON_FN

/* Scalar signed rounding shifts. */
uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}
265 
/* Unsigned rounding shift left. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

/* SME2 variants use the shift count at full element width. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int16_t)src2, 16, true, NULL))
NEON_GVEC_VOP2(sme2_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, src2, 32, true, NULL))
NEON_GVEC_VOP2(sme2_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, src2, true, NULL))
NEON_GVEC_VOP2(sme2_urshl_d, int64_t)
#undef NEON_FN

/* Scalar unsigned rounding shifts. */
uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}
312 
/* Unsigned saturating shift left; saturation sets the QC flag. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Signed saturating shift left. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
384 
/* Signed input, unsigned saturating result; saturation sets QC. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_h, int16_t)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_d, int64_t)
#undef NEON_FN
416 
/* Unsigned saturating rounding shift left; saturation sets QC. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/* Signed saturating rounding shift left. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
480 
481 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
482 {
483     uint32_t mask;
484     mask = (a ^ b) & 0x80808080u;
485     a &= ~0x80808080u;
486     b &= ~0x80808080u;
487     return (a + b) ^ mask;
488 }
489 
490 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
491 {
492     uint32_t mask;
493     mask = (a ^ b) & 0x80008000u;
494     a &= ~0x80008000u;
495     b &= ~0x80008000u;
496     return (a + b) ^ mask;
497 }
498 
/* Element-wise subtract and multiply (results wrap per lane). */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* Test bits: all-ones when the lanes share any set bit, else zero. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN
514 
/* Count Leading Sign/Zero Bits.  */
/* Count leading zeros of an 8-bit value (returns 8 for x == 0). */
static inline int do_clz8(uint8_t x)
{
    int count = 8;

    while (x != 0) {
        x >>= 1;
        count--;
    }
    return count;
}
523 
/* Count leading zeros of a 16-bit value (returns 16 for x == 0). */
static inline int do_clz16(uint16_t x)
{
    int count = 16;

    while (x != 0) {
        x >>= 1;
        count--;
    }
    return count;
}
531 
/* Per-element CLZ; the second NEON_FN argument is unused for unary ops. */
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* CLS: complement negative values so leading sign bits become leading
   zeros, then subtract one so the sign bit itself is not counted. */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN
547 
548 uint32_t HELPER(neon_cls_s32)(uint32_t x)
549 {
550     int count;
551     if ((int32_t)x < 0)
552         x = ~x;
553     for (count = 32; x; count--)
554         x = x >> 1;
555     return count - 1;
556 }
557 
/*
 * Saturating doubling multiply returning high half, 16-bit lanes,
 * optionally rounding.  The doubling step saturates when the product's
 * top two bits differ; the rounding addition saturates on signed
 * overflow.  Either saturation sets QC.
 */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

/* As NEON_QDMULH16, for 32-bit lanes with a 64-bit intermediate. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
609 
610 /* Only the low 32-bits of output are significant. */
611 uint64_t HELPER(neon_narrow_u8)(uint64_t x)
612 {
613     return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
614            | ((x >> 24) & 0xff000000u);
615 }
616 
617 /* Only the low 32-bits of output are significant. */
618 uint64_t HELPER(neon_narrow_u16)(uint64_t x)
619 {
620     return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
621 }
622 
623 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
624 {
625     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
626             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
627 }
628 
629 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
630 {
631     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
632 }
633 
634 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
635 {
636     x &= 0xff80ff80ff80ff80ull;
637     x += 0x0080008000800080ull;
638     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
639             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
640 }
641 
642 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
643 {
644     x &= 0xffff8000ffff8000ull;
645     x += 0x0000800000008000ull;
646     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
647 }
648 
/* Only the low 32-bits of output are significant. */
/* Narrow four signed 16-bit lanes to unsigned 8 bits with saturation:
   negative lanes give 0 (res starts at 0), lanes > 0xff give 0xff;
   either case sets QC. */
uint64_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else  { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Only the low 32-bits of output are significant. */
/* Narrow four unsigned 16-bit lanes to 8 bits, saturating at 0xff. */
uint64_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Only the low 32-bits of output are significant. */
/* Narrow four signed 16-bit lanes to signed 8 bits; out-of-range lanes
   saturate to 0x80/0x7f and set QC. */
uint64_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
724 
/* Only the low 32-bits of output are significant. */
/* Narrow two signed 32-bit lanes to unsigned 16 bits with saturation. */
uint64_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant. */
/* Narrow two unsigned 32-bit lanes to 16 bits, saturating at 0xffff. */
uint64_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant. */
/* Narrow two signed 32-bit lanes to signed 16 bits; out-of-range lanes
   saturate to 0x8000/0x7fff and set QC. */
uint64_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}
784 
/* Only the low 32-bits of output are significant. */
/* Narrow a signed 64-bit value to unsigned 32 bits with saturation. */
uint64_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Only the low 32-bits of output are significant. */
/* Narrow an unsigned 64-bit value to 32 bits, saturating at 0xffffffff. */
uint64_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Only the low 32-bits of output are significant. */
/* Narrow a signed 64-bit value to signed 32 bits with saturation. */
uint64_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return (uint32_t)((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return (uint32_t)x;
}
818 
819 uint64_t HELPER(neon_widen_u8)(uint32_t x)
820 {
821     uint64_t tmp;
822     uint64_t ret;
823     ret = (uint8_t)x;
824     tmp = (uint8_t)(x >> 8);
825     ret |= tmp << 16;
826     tmp = (uint8_t)(x >> 16);
827     ret |= tmp << 32;
828     tmp = (uint8_t)(x >> 24);
829     ret |= tmp << 48;
830     return ret;
831 }
832 
833 uint64_t HELPER(neon_widen_s8)(uint32_t x)
834 {
835     uint64_t tmp;
836     uint64_t ret;
837     ret = (uint16_t)(int8_t)x;
838     tmp = (uint16_t)(int8_t)(x >> 8);
839     ret |= tmp << 16;
840     tmp = (uint16_t)(int8_t)(x >> 16);
841     ret |= tmp << 32;
842     tmp = (uint16_t)(int8_t)(x >> 24);
843     ret |= tmp << 48;
844     return ret;
845 }
846 
847 uint64_t HELPER(neon_widen_u16)(uint32_t x)
848 {
849     uint64_t high = (uint16_t)(x >> 16);
850     return ((uint16_t)x) | (high << 32);
851 }
852 
853 uint64_t HELPER(neon_widen_s16)(uint32_t x)
854 {
855     uint64_t high = (int16_t)(x >> 16);
856     return ((uint32_t)(int16_t)x) | (high << 32);
857 }
858 
859 /* Pairwise long add: add pairs of adjacent elements into
860  * double-width elements in the result (eg _s8 is an 8x8->16 op)
861  */
862 uint64_t HELPER(neon_addlp_s8)(uint64_t a)
863 {
864     uint64_t nsignmask = 0x0080008000800080ULL;
865     uint64_t wsignmask = 0x8000800080008000ULL;
866     uint64_t elementmask = 0x00ff00ff00ff00ffULL;
867     uint64_t tmp1, tmp2;
868     uint64_t res, signres;
869 
870     /* Extract odd elements, sign extend each to a 16 bit field */
871     tmp1 = a & elementmask;
872     tmp1 ^= nsignmask;
873     tmp1 |= wsignmask;
874     tmp1 = (tmp1 - nsignmask) ^ wsignmask;
875     /* Ditto for the even elements */
876     tmp2 = (a >> 8) & elementmask;
877     tmp2 ^= nsignmask;
878     tmp2 |= wsignmask;
879     tmp2 = (tmp2 - nsignmask) ^ wsignmask;
880 
881     /* calculate the result by summing bits 0..14, 16..22, etc,
882      * and then adjusting the sign bits 15, 23, etc manually.
883      * This ensures the addition can't overflow the 16 bit field.
884      */
885     signres = (tmp1 ^ tmp2) & wsignmask;
886     res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask);
887     res ^= signres;
888 
889     return res;
890 }
891 
892 uint64_t HELPER(neon_addlp_s16)(uint64_t a)
893 {
894     int32_t reslo, reshi;
895 
896     reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16);
897     reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48);
898 
899     return (uint32_t)reslo | (((uint64_t)reshi) << 32);
900 }
901 
/* Add the two signed 32-bit lanes of a and b with saturation; a lane
 * overflows when the operands have the same sign but the sum's sign
 * differs, in which case it saturates towards the operand's sign and
 * sets QC. */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

/* 64-bit signed saturating add; saturation sets QC. */
uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}
935 
/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

/* Widening absolute difference: each result element is double the
   width of the input elements. */
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD
1006 
/* Widening multiply. Named type is the source type.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* Signed variant: the product is reduced to the 16-bit result lane, so
   an unsigned result type gives the same bit pattern. */
uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}
1063 
1064 uint64_t HELPER(neon_negl_u16)(uint64_t x)
1065 {
1066     uint16_t tmp;
1067     uint64_t result;
1068     result = (uint16_t)-x;
1069     tmp = -(x >> 16);
1070     result |= (uint64_t)tmp << 16;
1071     tmp = -(x >> 32);
1072     result |= (uint64_t)tmp << 32;
1073     tmp = -(x >> 48);
1074     result |= (uint64_t)tmp << 48;
1075     return result;
1076 }
1077 
1078 uint64_t HELPER(neon_negl_u32)(uint64_t x)
1079 {
1080     uint32_t low = -x;
1081     uint32_t high = -(x >> 32);
1082     return low | ((uint64_t)high << 32);
1083 }
1084 
/* Saturating sign manipulation.  */
/* ??? Make these use NEON_VOP1 */
/*
 * Per-lane saturating absolute value: INT8_MIN has no 8-bit positive
 * counterpart, so it saturates to INT8_MAX and sets the QC (cumulative
 * saturation) flag; any other negative lane is simply negated.
 */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
/* Saturating absolute value of the four s8 lanes packed in x.  */
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8
1106 
/*
 * Per-lane saturating negation: INT8_MIN has no 8-bit positive
 * counterpart, so it saturates to INT8_MAX and sets the QC flag;
 * every other lane value is negated.
 */
#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
/* Saturating negation of the four s8 lanes packed in x.  */
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8
1126 
/*
 * Per-lane saturating absolute value: INT16_MIN saturates to
 * INT16_MAX and sets the QC flag; other negative lanes are negated.
 */
#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
/* Saturating absolute value of the two s16 lanes packed in x.  */
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16
1144 
/*
 * Per-lane saturating negation: INT16_MIN saturates to INT16_MAX
 * and sets the QC flag; every other lane value is negated.
 */
#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
/* Saturating negation of the two s16 lanes packed in x.  */
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16
1162 
1163 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
1164 {
1165     if (x == SIGNBIT) {
1166         SET_QC();
1167         x = ~SIGNBIT;
1168     } else if ((int32_t)x < 0) {
1169         x = -x;
1170     }
1171     return x;
1172 }
1173 
1174 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
1175 {
1176     if (x == SIGNBIT) {
1177         SET_QC();
1178         x = ~SIGNBIT;
1179     } else {
1180         x = -x;
1181     }
1182     return x;
1183 }
1184 
1185 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
1186 {
1187     if (x == SIGNBIT64) {
1188         SET_QC();
1189         x = ~SIGNBIT64;
1190     } else if ((int64_t)x < 0) {
1191         x = -x;
1192     }
1193     return x;
1194 }
1195 
1196 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
1197 {
1198     if (x == SIGNBIT64) {
1199         SET_QC();
1200         x = ~SIGNBIT64;
1201     } else {
1202         x = -x;
1203     }
1204     return x;
1205 }
1206 
1207 /* NEON Float helpers.  */
1208 
1209 /* Floating point comparisons produce an integer result.
1210  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1211  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1212  */
1213 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, float_status *fpst)
1214 {
1215     return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
1216 }
1217 
1218 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, float_status *fpst)
1219 {
1220     return -float32_le(make_float32(b), make_float32(a), fpst);
1221 }
1222 
1223 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, float_status *fpst)
1224 {
1225     return -float32_lt(make_float32(b), make_float32(a), fpst);
1226 }
1227 
1228 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, float_status *fpst)
1229 {
1230     float32 f0 = float32_abs(make_float32(a));
1231     float32 f1 = float32_abs(make_float32(b));
1232     return -float32_le(f1, f0, fpst);
1233 }
1234 
1235 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, float_status *fpst)
1236 {
1237     float32 f0 = float32_abs(make_float32(a));
1238     float32 f1 = float32_abs(make_float32(b));
1239     return -float32_lt(f1, f0, fpst);
1240 }
1241 
1242 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, float_status *fpst)
1243 {
1244     float64 f0 = float64_abs(make_float64(a));
1245     float64 f1 = float64_abs(make_float64(b));
1246     return -float64_le(f1, f0, fpst);
1247 }
1248 
1249 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, float_status *fpst)
1250 {
1251     float64 f0 = float64_abs(make_float64(a));
1252     float64 f1 = float64_abs(make_float64(b));
1253     return -float64_lt(f1, f0, fpst);
1254 }
1255 
/* Extract element N of width SIZE bits (SIZE < 64) from vector word V.  */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1257 
/*
 * De-interleave (unzip) the 8-bit lanes of the 128-bit pair {vd,vm}:
 * even-numbered lanes are gathered into vd, odd-numbered lanes into vm.
 * Both sources are read in full before either destination is written.
 */
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1286 
/*
 * De-interleave (unzip) the 16-bit lanes of the 128-bit pair {vd,vm}:
 * even-numbered lanes to vd, odd-numbered lanes to vm.  Sources are
 * fully read before the destinations are written.
 */
void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1307 
/*
 * De-interleave (unzip) the 32-bit lanes of the 128-bit pair {vd,vm}:
 * even-numbered lanes to vd, odd-numbered lanes to vm.  Sources are
 * fully read before the destinations are written.
 */
void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1324 
/*
 * 64-bit variant: de-interleave the 8-bit lanes of {vd,vm}, even
 * lanes into vd, odd lanes into vm.  Sources are read before the
 * destinations are written.
 */
void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}
1342 
/*
 * 64-bit variant: de-interleave the 16-bit lanes of {vd,vm}, even
 * lanes into vd, odd lanes into vm.
 */
void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}
1356 
/*
 * Interleave (zip) the 8-bit lanes of the 128-bit pair {vd,vm}:
 * the low halves of vd/vm interleave into vd, the high halves into
 * vm.  Sources are fully read before the destinations are written.
 */
void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1385 
/*
 * Interleave (zip) the 16-bit lanes of the 128-bit pair {vd,vm}:
 * low halves interleave into vd, high halves into vm.  Sources are
 * fully read before the destinations are written.
 */
void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1406 
/*
 * Interleave (zip) the 32-bit lanes of the 128-bit pair {vd,vm}:
 * low halves interleave into vd, high halves into vm.  Sources are
 * fully read before the destinations are written.
 */
void HELPER(neon_qzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
1423 
/*
 * 64-bit variant: interleave the 8-bit lanes of {vd,vm}; the low
 * halves interleave into vd and the high halves into vm.  Sources
 * are read before the destinations are written.
 */
void HELPER(neon_zip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}
1441 
/*
 * 64-bit variant: interleave the 16-bit lanes of {vd,vm}; the low
 * halves interleave into vd and the high halves into vm.
 */
void HELPER(neon_zip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}
1455