xref: /qemu/target/arm/tcg/neon_helper.c (revision 513823e7521a09ed7ad1e32e6454bac3b2cbf52d)
1 /*
2  * ARM NEON vector operations.
3  *
4  * Copyright (c) 2007, 2008 CodeSourcery.
5  * Written by Paul Brook
6  *
7  * This code is licensed under the GNU GPL v2.
8  */
9 
10 #include "qemu/osdep.h"
11 #include "cpu.h"
12 #include "exec/helper-proto.h"
13 #include "tcg/tcg-gvec-desc.h"
14 #include "fpu/softfloat.h"
15 #include "vec_internal.h"
16 
/* Sign-bit masks used by the 32-bit and 64-bit saturation checks below. */
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Record saturation by setting the sticky QC bit (cumulative saturation). */
#define SET_QC() env->vfp.qc[0] = 1
21 
/*
 * NEON_TYPEn declares a struct of n lanes that together occupy 32 bits.
 * Lane v1 is the least significant lane of the packed uint32_t value,
 * so on big-endian hosts the fields are declared in reverse order to
 * keep the in-memory layout identical to the uint32_t representation
 * used by NEON_UNPACK/NEON_PACK below.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif
58 
/* Lane layouts for the packed 32-bit values the helpers operate on. */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
68 
/* Copy from a uint32_t to a vector structure type.
 * Uses a union for the conversion, which is well-defined type punning
 * in C and avoids strict-aliasing problems. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)
88 
/* Apply NEON_FN to each lane of the operands in turn. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/*
 * Body of a binary element-wise helper: unpack both uint32_t operands
 * into n-lane vectors, apply NEON_FN lane by lane, and repack the
 * result into a uint32_t.
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP, but the helper also receives the CPU state pointer so
 * that NEON_FN can record saturation via env->vfp.qc. */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
120 
/*
 * Define a gvec-style helper operating on whole vectors in memory:
 * desc encodes the operation size (simd_oprsz) and the maximum vector
 * size (simd_maxsz); clear_tail zeroes any bytes between the two.
 */
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    vtype *d = vd, *n = vn, *m = vm;                            \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

/* As above, plus the CPU state pointer so NEON_FN can set QC. */
#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, CPUARMState *env, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    vtype *d = vd, *n = vn, *m = vm;                            \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], m[i]);                              \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

/* Vector-by-immediate variant: the second operand is a scalar taken
 * from the simd_data field of desc rather than a third vector. */
#define NEON_GVEC_VOP2i_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, CPUARMState *env, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    int imm = simd_data(desc);                                  \
    vtype *d = vd, *n = vn;                                     \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) {              \
        NEON_FN(d[i], n[i], imm);                               \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}
154 
/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
/* Each result lane combines a pair of adjacent lanes of one operand:
   the low half of the result comes from arg1's pairs, the high half
   from arg2's pairs. */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
180 
/* Unary operators.  */
/* Note there is no vsrc2 here: unary NEON_FNs ignore their third
 * ("dummy") argument, so the vsrc2 references produced by NEON_DO##n
 * disappear during macro expansion. */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
192 
/* Pairwise minimum of adjacent 8/16-bit lanes. */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Pairwise maximum of adjacent 8/16-bit lanes. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN
206 
/*
 * Element-wise shifts by a signed per-element count held in the low
 * byte of src2: positive counts shift left, negative counts shift
 * right.  The do_{s,u}qrshl_* helpers (vec_internal.h) implement the
 * signed/unsigned, rounding and saturating combinations; the third
 * boolean selects rounding, and passing NULL for the flag pointer
 * disables saturation tracking.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

/* Signed rounding shifts (SRSHL). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}
248 
/* Unsigned rounding shifts (URSHL); shift count semantics as above. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}
280 
/* Unsigned saturating shifts; saturation is recorded in env->vfp.qc. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
316 
/* Signed saturating shifts; saturation is recorded in env->vfp.qc. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
352 
/* Signed-input, unsigned-result saturating left shifts (SQSHLU). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_h, int16_t)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_d, int64_t)
#undef NEON_FN
384 
/* Unsigned saturating rounding shifts (UQRSHL). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
416 
/* Signed saturating rounding shifts (SQRSHL). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
448 
449 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
450 {
451     uint32_t mask;
452     mask = (a ^ b) & 0x80808080u;
453     a &= ~0x80808080u;
454     b &= ~0x80808080u;
455     return (a + b) ^ mask;
456 }
457 
458 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
459 {
460     uint32_t mask;
461     mask = (a ^ b) & 0x80008000u;
462     a &= ~0x80008000u;
463     b &= ~0x80008000u;
464     return (a + b) ^ mask;
465 }
466 
/* Lane-wise subtract (wraps modulo the lane size). */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise multiply, keeping the low half of each product. */
#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise test: all-ones if the lanes share any set bit, else zero. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN
482 
/* Count Leading Sign/Zero Bits.  */

/* Count leading zero bits in an 8-bit value; returns 8 for x == 0. */
static inline int do_clz8(uint8_t x)
{
    int count = 8;

    while (x != 0) {
        x >>= 1;
        count--;
    }
    return count;
}
491 
/* Count leading zero bits in a 16-bit value; returns 16 for x == 0. */
static inline int do_clz16(uint16_t x)
{
    int count = 16;

    while (x != 0) {
        x >>= 1;
        count--;
    }
    return count;
}
499 
/* Per-lane count-leading-zeros. */
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* Per-lane count-leading-sign-bits: invert negative inputs so the
 * leading run becomes zeros, then subtract one because the sign bit
 * itself is not counted. */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN
515 
516 uint32_t HELPER(neon_cls_s32)(uint32_t x)
517 {
518     int count;
519     if ((int32_t)x < 0)
520         x = ~x;
521     for (count = 32; x; count--)
522         x = x >> 1;
523     return count - 1;
524 }
525 
/* Signed saturating doubling multiply returning the high half,
 * i.e. (2 * src1 * src2) >> 16, with optional rounding.  The
 * (tmp ^ (tmp << 1)) test detects when doubling would change the
 * sign (only possible for 0x8000 * 0x8000); the lane then saturates
 * and QC is set.  Rounding adds 1 << 15 before taking the high half,
 * saturating again if that addition overflows. */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16
551 
/* 32-bit version of NEON_QDMULH16 above: (2 * src1 * src2) >> 32 with
 * saturation and optional rounding, computed in 64 bits. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
577 
578 /* Only the low 32-bits of output are significant. */
579 uint64_t HELPER(neon_narrow_u8)(uint64_t x)
580 {
581     return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
582            | ((x >> 24) & 0xff000000u);
583 }
584 
585 /* Only the low 32-bits of output are significant. */
586 uint64_t HELPER(neon_narrow_u16)(uint64_t x)
587 {
588     return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
589 }
590 
591 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
592 {
593     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
594             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
595 }
596 
597 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
598 {
599     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
600 }
601 
602 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
603 {
604     x &= 0xff80ff80ff80ff80ull;
605     x += 0x0080008000800080ull;
606     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
607             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
608 }
609 
610 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
611 {
612     x &= 0xffff8000ffff8000ull;
613     x += 0x0000800000008000ull;
614     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
615 }
616 
/* Narrow each signed 16-bit lane to an unsigned byte with saturation:
 * negative lanes clamp to 0 and lanes above 0xff clamp to 0xff, setting
 * QC in either case.  Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else  { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
644 
/* Narrow each unsigned 16-bit lane to a byte, clamping to 0xff and
 * setting QC on saturation.  Only the low 32-bits of output are
 * significant. */
uint64_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
668 
/* Narrow each signed 16-bit lane to a signed byte, clamping to
 * [-0x80, 0x7f]: (s >> 15) ^ 0x7f yields 0x80 for negative overflow
 * and 0x7f for positive overflow.  QC is set on saturation.  Only the
 * low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
692 
/* Narrow each signed 32-bit lane to an unsigned 16-bit value with
 * saturation (negative -> 0, > 0xffff -> 0xffff), setting QC on
 * saturation.  Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}
716 
/* Narrow each unsigned 32-bit lane to 16 bits, clamping to 0xffff and
 * setting QC on saturation.  Only the low 32-bits of output are
 * significant. */
uint64_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}
734 
/* Narrow each signed 32-bit lane to a signed 16-bit value, clamping to
 * [-0x8000, 0x7fff] ((v >> 31) ^ 0x7fff picks the right extreme) and
 * setting QC on saturation.  Only the low 32-bits of output are
 * significant. */
uint64_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}
752 
/* Narrow a signed 64-bit value to an unsigned 32-bit value with
 * saturation (negative -> 0, too large -> 0xffffffff), setting QC on
 * saturation.  Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}
766 
/* Narrow an unsigned 64-bit value to 32 bits, clamping to 0xffffffff
 * and setting QC on saturation.  Only the low 32-bits of output are
 * significant. */
uint64_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}
776 
/* Narrow a signed 64-bit value to a signed 32-bit value, clamping to
 * [INT32_MIN, INT32_MAX] and setting QC on saturation.  Only the low
 * 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return (uint32_t)((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return (uint32_t)x;
}
786 
787 uint64_t HELPER(neon_widen_u8)(uint32_t x)
788 {
789     uint64_t tmp;
790     uint64_t ret;
791     ret = (uint8_t)x;
792     tmp = (uint8_t)(x >> 8);
793     ret |= tmp << 16;
794     tmp = (uint8_t)(x >> 16);
795     ret |= tmp << 32;
796     tmp = (uint8_t)(x >> 24);
797     ret |= tmp << 48;
798     return ret;
799 }
800 
801 uint64_t HELPER(neon_widen_s8)(uint32_t x)
802 {
803     uint64_t tmp;
804     uint64_t ret;
805     ret = (uint16_t)(int8_t)x;
806     tmp = (uint16_t)(int8_t)(x >> 8);
807     ret |= tmp << 16;
808     tmp = (uint16_t)(int8_t)(x >> 16);
809     ret |= tmp << 32;
810     tmp = (uint16_t)(int8_t)(x >> 24);
811     ret |= tmp << 48;
812     return ret;
813 }
814 
815 uint64_t HELPER(neon_widen_u16)(uint32_t x)
816 {
817     uint64_t high = (uint16_t)(x >> 16);
818     return ((uint16_t)x) | (high << 32);
819 }
820 
821 uint64_t HELPER(neon_widen_s16)(uint32_t x)
822 {
823     uint64_t high = (int16_t)(x >> 16);
824     return ((uint32_t)(int16_t)x) | (high << 32);
825 }
826 
/* Pairwise long add: add pairs of adjacent elements into
 * double-width elements in the result (eg _s8 is an 8x8->16 op)
 */
uint64_t HELPER(neon_addlp_s8)(uint64_t a)
{
    uint64_t nsignmask = 0x0080008000800080ULL; /* sign bit of each 8-bit field */
    uint64_t wsignmask = 0x8000800080008000ULL; /* sign bit of each 16-bit lane */
    uint64_t elementmask = 0x00ff00ff00ff00ffULL;
    uint64_t tmp1, tmp2;
    uint64_t res, signres;

    /* Extract odd elements, sign extend each to a 16 bit field */
    tmp1 = a & elementmask;
    tmp1 ^= nsignmask;
    tmp1 |= wsignmask;
    /* SWAR sign-extension, (x ^ 0x80) - 0x80 per field: the lane sign
     * bits were forced to 1 above so the subtraction cannot borrow
     * across lanes, and the final XOR undoes that bias. */
    tmp1 = (tmp1 - nsignmask) ^ wsignmask;
    /* Ditto for the even elements */
    tmp2 = (a >> 8) & elementmask;
    tmp2 ^= nsignmask;
    tmp2 |= wsignmask;
    tmp2 = (tmp2 - nsignmask) ^ wsignmask;

    /* calculate the result by summing bits 0..14, 16..30, etc,
     * and then adjusting the sign bits 15, 31, etc manually.
     * This ensures the addition can't overflow the 16 bit field.
     */
    signres = (tmp1 ^ tmp2) & wsignmask;
    res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask);
    res ^= signres;

    return res;
}
859 
860 uint64_t HELPER(neon_addlp_s16)(uint64_t a)
861 {
862     int32_t reslo, reshi;
863 
864     reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16);
865     reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48);
866 
867     return (uint32_t)reslo | (((uint64_t)reshi) << 32);
868 }
869 
/* Signed saturating add of each 32-bit lane of two 2x32 vectors.
 * Overflow occurs when the operands share a sign but the sum's sign
 * differs; the lane then saturates towards INT32_MIN/INT32_MAX and QC
 * is set. */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}
891 
/* Signed saturating 64-bit add: saturates towards INT64_MIN/INT64_MAX
 * on overflow and sets QC. */
uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}
903 
/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
/* dest = |x - y|, with x and y first converted to intype and the
 * difference computed in arithtype. */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)
913 
/* Widening absolute-difference helpers: each result lane is the
 * absolute difference of the corresponding source lanes, stored in a
 * lane of twice the width (the suffix names the *result* width). */
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD
974 
/* Widening multiply. Named type is the source type.  */
/* dest = (type2)x * (type2)y; type2 is the unsigned double-width type
 * so the per-lane products can be shifted and ORed together without
 * sign bits spilling into neighbouring lanes. */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)
981 
/* Widening multiplies: each pair of source lanes produces a product in
 * a result lane of twice the width. */
uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}
1031 
1032 uint64_t HELPER(neon_negl_u16)(uint64_t x)
1033 {
1034     uint16_t tmp;
1035     uint64_t result;
1036     result = (uint16_t)-x;
1037     tmp = -(x >> 16);
1038     result |= (uint64_t)tmp << 16;
1039     tmp = -(x >> 32);
1040     result |= (uint64_t)tmp << 32;
1041     tmp = -(x >> 48);
1042     result |= (uint64_t)tmp << 48;
1043     return result;
1044 }
1045 
1046 uint64_t HELPER(neon_negl_u32)(uint64_t x)
1047 {
1048     uint32_t low = -x;
1049     uint32_t high = -(x >> 32);
1050     return low | ((uint64_t)high << 32);
1051 }
1052 
1053 /* Saturating sign manipulation.  */
1054 /* ??? Make these use NEON_VOP1 */
1055 #define DO_QABS8(x) do { \
1056     if (x == (int8_t)0x80) { \
1057         x = 0x7f; \
1058         SET_QC(); \
1059     } else if (x < 0) { \
1060         x = -x; \
1061     }} while (0)
1062 uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
1063 {
1064     neon_s8 vec;
1065     NEON_UNPACK(neon_s8, vec, x);
1066     DO_QABS8(vec.v1);
1067     DO_QABS8(vec.v2);
1068     DO_QABS8(vec.v3);
1069     DO_QABS8(vec.v4);
1070     NEON_PACK(neon_s8, x, vec);
1071     return x;
1072 }
1073 #undef DO_QABS8
1074 
1075 #define DO_QNEG8(x) do { \
1076     if (x == (int8_t)0x80) { \
1077         x = 0x7f; \
1078         SET_QC(); \
1079     } else { \
1080         x = -x; \
1081     }} while (0)
1082 uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
1083 {
1084     neon_s8 vec;
1085     NEON_UNPACK(neon_s8, vec, x);
1086     DO_QNEG8(vec.v1);
1087     DO_QNEG8(vec.v2);
1088     DO_QNEG8(vec.v3);
1089     DO_QNEG8(vec.v4);
1090     NEON_PACK(neon_s8, x, vec);
1091     return x;
1092 }
1093 #undef DO_QNEG8
1094 
1095 #define DO_QABS16(x) do { \
1096     if (x == (int16_t)0x8000) { \
1097         x = 0x7fff; \
1098         SET_QC(); \
1099     } else if (x < 0) { \
1100         x = -x; \
1101     }} while (0)
1102 uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
1103 {
1104     neon_s16 vec;
1105     NEON_UNPACK(neon_s16, vec, x);
1106     DO_QABS16(vec.v1);
1107     DO_QABS16(vec.v2);
1108     NEON_PACK(neon_s16, x, vec);
1109     return x;
1110 }
1111 #undef DO_QABS16
1112 
1113 #define DO_QNEG16(x) do { \
1114     if (x == (int16_t)0x8000) { \
1115         x = 0x7fff; \
1116         SET_QC(); \
1117     } else { \
1118         x = -x; \
1119     }} while (0)
1120 uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
1121 {
1122     neon_s16 vec;
1123     NEON_UNPACK(neon_s16, vec, x);
1124     DO_QNEG16(vec.v1);
1125     DO_QNEG16(vec.v2);
1126     NEON_PACK(neon_s16, x, vec);
1127     return x;
1128 }
1129 #undef DO_QNEG16
1130 
1131 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
1132 {
1133     if (x == SIGNBIT) {
1134         SET_QC();
1135         x = ~SIGNBIT;
1136     } else if ((int32_t)x < 0) {
1137         x = -x;
1138     }
1139     return x;
1140 }
1141 
1142 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
1143 {
1144     if (x == SIGNBIT) {
1145         SET_QC();
1146         x = ~SIGNBIT;
1147     } else {
1148         x = -x;
1149     }
1150     return x;
1151 }
1152 
1153 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
1154 {
1155     if (x == SIGNBIT64) {
1156         SET_QC();
1157         x = ~SIGNBIT64;
1158     } else if ((int64_t)x < 0) {
1159         x = -x;
1160     }
1161     return x;
1162 }
1163 
1164 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
1165 {
1166     if (x == SIGNBIT64) {
1167         SET_QC();
1168         x = ~SIGNBIT64;
1169     } else {
1170         x = -x;
1171     }
1172     return x;
1173 }
1174 
1175 /* NEON Float helpers.  */
1176 
1177 /* Floating point comparisons produce an integer result.
1178  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1179  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1180  */
1181 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, float_status *fpst)
1182 {
1183     return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
1184 }
1185 
1186 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, float_status *fpst)
1187 {
1188     return -float32_le(make_float32(b), make_float32(a), fpst);
1189 }
1190 
1191 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, float_status *fpst)
1192 {
1193     return -float32_lt(make_float32(b), make_float32(a), fpst);
1194 }
1195 
1196 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, float_status *fpst)
1197 {
1198     float32 f0 = float32_abs(make_float32(a));
1199     float32 f1 = float32_abs(make_float32(b));
1200     return -float32_le(f1, f0, fpst);
1201 }
1202 
1203 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, float_status *fpst)
1204 {
1205     float32 f0 = float32_abs(make_float32(a));
1206     float32 f1 = float32_abs(make_float32(b));
1207     return -float32_lt(f1, f0, fpst);
1208 }
1209 
1210 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, float_status *fpst)
1211 {
1212     float64 f0 = float64_abs(make_float64(a));
1213     float64 f1 = float64_abs(make_float64(b));
1214     return -float64_le(f1, f0, fpst);
1215 }
1216 
1217 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, float_status *fpst)
1218 {
1219     float64 f0 = float64_abs(make_float64(a));
1220     float64 f1 = float64_abs(make_float64(b));
1221     return -float64_lt(f1, f0, fpst);
1222 }
1223 
1224 #define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1225 
1226 void HELPER(neon_qunzip8)(void *vd, void *vm)
1227 {
1228     uint64_t *rd = vd, *rm = vm;
1229     uint64_t zd0 = rd[0], zd1 = rd[1];
1230     uint64_t zm0 = rm[0], zm1 = rm[1];
1231 
1232     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
1233         | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
1234         | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
1235         | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
1236     uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
1237         | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
1238         | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1239         | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
1240     uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
1241         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
1242         | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
1243         | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
1244     uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
1245         | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
1246         | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
1247         | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1248 
1249     rm[0] = m0;
1250     rm[1] = m1;
1251     rd[0] = d0;
1252     rd[1] = d1;
1253 }
1254 
1255 void HELPER(neon_qunzip16)(void *vd, void *vm)
1256 {
1257     uint64_t *rd = vd, *rm = vm;
1258     uint64_t zd0 = rd[0], zd1 = rd[1];
1259     uint64_t zm0 = rm[0], zm1 = rm[1];
1260 
1261     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
1262         | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
1263     uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
1264         | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
1265     uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
1266         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
1267     uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
1268         | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1269 
1270     rm[0] = m0;
1271     rm[1] = m1;
1272     rd[0] = d0;
1273     rd[1] = d1;
1274 }
1275 
1276 void HELPER(neon_qunzip32)(void *vd, void *vm)
1277 {
1278     uint64_t *rd = vd, *rm = vm;
1279     uint64_t zd0 = rd[0], zd1 = rd[1];
1280     uint64_t zm0 = rm[0], zm1 = rm[1];
1281 
1282     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
1283     uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1284     uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
1285     uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1286 
1287     rm[0] = m0;
1288     rm[1] = m1;
1289     rd[0] = d0;
1290     rd[1] = d1;
1291 }
1292 
1293 void HELPER(neon_unzip8)(void *vd, void *vm)
1294 {
1295     uint64_t *rd = vd, *rm = vm;
1296     uint64_t zd = rd[0], zm = rm[0];
1297 
1298     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
1299         | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
1300         | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1301         | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
1302     uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
1303         | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
1304         | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
1305         | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1306 
1307     rm[0] = m0;
1308     rd[0] = d0;
1309 }
1310 
1311 void HELPER(neon_unzip16)(void *vd, void *vm)
1312 {
1313     uint64_t *rd = vd, *rm = vm;
1314     uint64_t zd = rd[0], zm = rm[0];
1315 
1316     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
1317         | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
1318     uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
1319         | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1320 
1321     rm[0] = m0;
1322     rd[0] = d0;
1323 }
1324 
1325 void HELPER(neon_qzip8)(void *vd, void *vm)
1326 {
1327     uint64_t *rd = vd, *rm = vm;
1328     uint64_t zd0 = rd[0], zd1 = rd[1];
1329     uint64_t zm0 = rm[0], zm1 = rm[1];
1330 
1331     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
1332         | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
1333         | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
1334         | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
1335     uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
1336         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
1337         | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
1338         | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
1339     uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
1340         | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
1341         | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1342         | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
1343     uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
1344         | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
1345         | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
1346         | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1347 
1348     rm[0] = m0;
1349     rm[1] = m1;
1350     rd[0] = d0;
1351     rd[1] = d1;
1352 }
1353 
1354 void HELPER(neon_qzip16)(void *vd, void *vm)
1355 {
1356     uint64_t *rd = vd, *rm = vm;
1357     uint64_t zd0 = rd[0], zd1 = rd[1];
1358     uint64_t zm0 = rm[0], zm1 = rm[1];
1359 
1360     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
1361         | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
1362     uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
1363         | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
1364     uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
1365         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
1366     uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
1367         | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1368 
1369     rm[0] = m0;
1370     rm[1] = m1;
1371     rd[0] = d0;
1372     rd[1] = d1;
1373 }
1374 
1375 void HELPER(neon_qzip32)(void *vd, void *vm)
1376 {
1377     uint64_t *rd = vd, *rm = vm;
1378     uint64_t zd0 = rd[0], zd1 = rd[1];
1379     uint64_t zm0 = rm[0], zm1 = rm[1];
1380 
1381     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
1382     uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
1383     uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1384     uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1385 
1386     rm[0] = m0;
1387     rm[1] = m1;
1388     rd[0] = d0;
1389     rd[1] = d1;
1390 }
1391 
1392 void HELPER(neon_zip8)(void *vd, void *vm)
1393 {
1394     uint64_t *rd = vd, *rm = vm;
1395     uint64_t zd = rd[0], zm = rm[0];
1396 
1397     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
1398         | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
1399         | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1400         | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
1401     uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
1402         | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
1403         | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
1404         | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1405 
1406     rm[0] = m0;
1407     rd[0] = d0;
1408 }
1409 
1410 void HELPER(neon_zip16)(void *vd, void *vm)
1411 {
1412     uint64_t *rd = vd, *rm = vm;
1413     uint64_t zd = rd[0], zm = rm[0];
1414 
1415     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
1416         | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
1417     uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
1418         | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1419 
1420     rm[0] = m0;
1421     rd[0] = d0;
1422 }
1423