1 /*
2 * ARM NEON vector operations.
3 *
4 * Copyright (c) 2007, 2008 CodeSourcery.
5 * Written by Paul Brook
6 *
7 * This code is licensed under the GNU GPL v2.
8 */
9
10 #include "qemu/osdep.h"
11 #include "cpu.h"
12 #include "tcg/tcg-gvec-desc.h"
13 #include "fpu/softfloat.h"
14 #include "vec_internal.h"
15
16 #define HELPER_H "tcg/helper.h"
17 #include "exec/helper-proto.h.inc"
18
19 #define SIGNBIT (uint32_t)0x80000000
20 #define SIGNBIT64 ((uint64_t)1 << 63)
21
22 #define SET_QC() env->vfp.qc[0] = 1
23
/*
 * Lane-container types: a 32-bit value viewed as 1, 2 or 4 lanes.
 * Member v1 is always the least-significant lane of the uint32_t, so the
 * declaration order of the members flips with host endianness.  The
 * NEON_UNPACK/NEON_PACK macros below type-pun these structs against a
 * uint32_t through a union, which is what makes this layout matter.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

/* Instantiate the lane-container types used by the helpers below. */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
70
/* Copy from a uint32_t to a vector structure type. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t. */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/*
 * Apply NEON_FN (defined by the caller just before instantiation) to each
 * lane of vsrc1/vsrc2, writing vdest.  NEON_DOn is selected by the lane
 * count 'n' passed to NEON_VOP_BODY.
 */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/*
 * Function body for a 32-bit helper: unpack both uint32_t operands into
 * lane structs, apply NEON_FN lane-wise, repack and return the result.
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Helper taking two packed uint32_t operands. */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP, but the helper also receives the CPU state (for QC flag). */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/*
 * Whole-vector (gvec) helper: apply NEON_FN element-wise over the
 * operation size from the descriptor, then zero the tail bytes up to the
 * maximum vector size.
 */
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

/* gvec helper variant that also receives CPU state (QC flag updates). */
#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, CPUARMState *env, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

/*
 * gvec helper with an immediate second operand, carried in the simd
 * descriptor's data field instead of a vector register.
 */
#define NEON_GVEC_VOP2i_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, CPUARMState *env, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int imm = simd_data(desc); \
    vtype *d = vd, *n = vn; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], imm); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}
156
/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same. */
/*
 * Pairwise lane application: lanes of the destination's low half come
 * from adjacent pairs of arg1, the high half from adjacent pairs of arg2.
 */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

/* Pairwise helper: like NEON_VOP but combines adjacent lanes. */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators. */
/*
 * Single-operand helper: NEON_FN still takes three arguments but the
 * instantiations pass a dummy second source (see the clz/cls users).
 */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
194
/* Pairwise minimum: lane-wise min of adjacent element pairs. */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Pairwise maximum: lane-wise max of adjacent element pairs. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

/*
 * The do_[su]qrshl_{bhs,d} helpers come from vec_internal.h: arguments are
 * (value, signed shift count, element bits, round, qc-flag pointer).  A
 * NULL qc pointer selects the non-saturating variant; 'round' selects
 * rounding shifts.  The shift count is the low signed byte of the second
 * operand, as the NEON VSHL/VRSHL family specifies.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

/* Signed rounding shifts (VRSHL). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

/* Signed rounding shift of a single 32-bit element. */
uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

/* Signed rounding shift of a single 64-bit element. */
uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}
250
/* Unsigned rounding shifts (VRSHL, unsigned). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

/* Unsigned rounding shift of a single 32-bit element. */
uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

/* Unsigned rounding shift of a single 64-bit element. */
uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}

/*
 * Unsigned saturating shifts (VQSHL, unsigned): pass env->vfp.qc so the
 * shared helper can set the sticky QC flag on saturation.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_d, uint64_t)
#undef NEON_FN

/* Unsigned saturating shift of a single 32-bit element. */
uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

/* Unsigned saturating shift of a single 64-bit element. */
uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
318
/* Signed saturating shifts (VQSHL, signed); QC set on saturation. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_d, int64_t)
#undef NEON_FN

/* Signed saturating shift of a single 32-bit element. */
uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

/* Signed saturating shift of a single 64-bit element. */
uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/*
 * VQSHLU: signed input, unsigned saturating result (do_suqrshl_* helpers
 * saturate negative inputs to zero and set QC).
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_h, int16_t)
#undef NEON_FN

/* VQSHLU of a single 32-bit element. */
uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

/* VQSHLU of a single 64-bit element. */
uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_d, int64_t)
#undef NEON_FN
386
/* Unsigned saturating rounding shifts (VQRSHL, unsigned). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
#undef NEON_FN

/* VQRSHL (unsigned) of a single 32-bit element. */
uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

/* VQRSHL (unsigned) of a single 64-bit element. */
uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/* Signed saturating rounding shifts (VQRSHL, signed). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
#undef NEON_FN

/* VQRSHL (signed) of a single 32-bit element. */
uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

/* VQRSHL (signed) of a single 64-bit element. */
uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
450
HELPER(neon_add_u8)451 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
452 {
453 uint32_t mask;
454 mask = (a ^ b) & 0x80808080u;
455 a &= ~0x80808080u;
456 b &= ~0x80808080u;
457 return (a + b) ^ mask;
458 }
459
HELPER(neon_add_u16)460 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
461 {
462 uint32_t mask;
463 mask = (a ^ b) & 0x80008000u;
464 a &= ~0x80008000u;
465 b &= ~0x80008000u;
466 return (a + b) ^ mask;
467 }
468
/* Lane-wise subtraction (modular, per-lane). */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise multiplication (low half of the product, per-lane). */
#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* VTST: all-ones if the lanes share any set bit, else zero. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN
484
485 /* Count Leading Sign/Zero Bits. */
/* Count leading zero bits of an 8-bit value; returns 8 for x == 0. */
static inline int do_clz8(uint8_t x)
{
    int count = 8;
    while (x != 0) {
        x >>= 1;
        count--;
    }
    return count;
}
493
/* Count leading zero bits of a 16-bit value; returns 16 for x == 0. */
static inline int do_clz16(uint16_t x)
{
    int count = 16;
    while (x != 0) {
        x >>= 1;
        count--;
    }
    return count;
}
501
/* Lane-wise count-leading-zeros (the src2 argument is unused). */
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/*
 * Lane-wise count-leading-sign: complement negative lanes so the sign
 * bits become leading zeros, then subtract 1 to exclude the sign bit
 * itself from the count.
 */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN
517
HELPER(neon_cls_s32)518 uint32_t HELPER(neon_cls_s32)(uint32_t x)
519 {
520 int count;
521 if ((int32_t)x < 0)
522 x = ~x;
523 for (count = 32; x; count--)
524 x = x >> 1;
525 return count - 1;
526 }
527
/*
 * Saturating doubling multiply returning high half (VQDMULH/VQRDMULH),
 * 16-bit lanes.  The doubled product overflows only for the single input
 * pair 0x8000 * 0x8000; the (tmp ^ (tmp << 1)) test catches exactly that
 * case (top two bits of the product differ) and saturates to 0x7fffffff.
 * With 'round', 1 << 15 is added before taking the high half; a wrap of
 * that addition is a second saturation case.
 */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

/* As NEON_QDMULH16 but for 32-bit lanes using a 64-bit intermediate. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
579
580 /* Only the low 32-bits of output are significant. */
/* Only the low 32-bits of output are significant. */
/* Narrow four 16-bit lanes to four 8-bit lanes, keeping the low bytes. */
uint64_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

/* Only the low 32-bits of output are significant. */
/* Narrow two 32-bit lanes to two 16-bit lanes, keeping the low halves. */
uint64_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

/* Narrow four 16-bit lanes to 8-bit lanes, keeping the high bytes. */
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

/* Narrow two 32-bit lanes to 16-bit lanes, keeping the high halves. */
uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/*
 * Rounding narrow-high: add half of the discarded low byte before taking
 * the high byte of each 16-bit lane.  Masking to the top 9 bits first
 * keeps the rounding increment from carrying into the next lane.
 */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

/* Rounding narrow-high for two 32-bit lanes to 16-bit lanes. */
uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}
618
619 /* Only the low 32-bits of output are significant. */
/* Only the low 32-bits of output are significant. */
/*
 * Narrow four signed 16-bit lanes to unsigned 8-bit lanes with unsigned
 * saturation (VQMOVUN): negative lanes saturate to 0, lanes above 0xff
 * saturate to 0xff; QC is set on any saturation.
 */
uint64_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else  { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Only the low 32-bits of output are significant. */
/* Unsigned saturating narrow of four u16 lanes to u8 lanes (VQMOVN). */
uint64_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Only the low 32-bits of output are significant. */
/*
 * Signed saturating narrow of four s16 lanes to s8 lanes (VQMOVN):
 * out-of-range lanes saturate to 0x7f/0x80 depending on sign.
 */
uint64_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else  { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
694
695 /* Only the low 32-bits of output are significant. */
/* Only the low 32-bits of output are significant. */
/*
 * Narrow two signed 32-bit lanes to unsigned 16-bit lanes with unsigned
 * saturation (VQMOVUN): negative lanes -> 0, lanes > 0xffff -> 0xffff.
 */
uint64_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant. */
/* Unsigned saturating narrow of two u32 lanes to u16 lanes (VQMOVN). */
uint64_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant. */
/* Signed saturating narrow of two s32 lanes to s16 lanes (VQMOVN). */
uint64_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}
754
755 /* Only the low 32-bits of output are significant. */
/* Only the low 32-bits of output are significant. */
/*
 * Narrow a signed 64-bit value to an unsigned 32-bit value with unsigned
 * saturation (VQMOVUN): negative -> 0, > 0xffffffff -> 0xffffffff.
 */
uint64_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Only the low 32-bits of output are significant. */
/* Unsigned saturating narrow of a u64 to a u32 (VQMOVN). */
uint64_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Only the low 32-bits of output are significant. */
/* Signed saturating narrow of an s64 to an s32 (VQMOVN). */
uint64_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return (uint32_t)((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return (uint32_t)x;
}
788
/* Widen four u8 lanes to four u16 lanes (zero extension). */
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

/*
 * Widen four s8 lanes to four s16 lanes (sign extension).  The
 * (uint16_t)(int8_t) double cast sign-extends to 16 bits and then
 * truncates, so each lane stays within its 16-bit slot.
 */
uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

/* Widen two u16 lanes to two u32 lanes (zero extension). */
uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

/* Widen two s16 lanes to two s32 lanes (sign extension). */
uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}
828
829 /* Pairwise long add: add pairs of adjacent elements into
830 * double-width elements in the result (eg _s8 is an 8x8->16 op)
831 */
/* Pairwise long add: add pairs of adjacent elements into
 * double-width elements in the result (eg _s8 is an 8x8->16 op)
 */
/*
 * Branch-free SIMD-within-register implementation: each byte pair is
 * sign-extended into a 16-bit field using the classic xor/subtract
 * sign-extension trick, then the two extended vectors are summed with
 * the per-field sign bits handled separately so no carry can cross a
 * 16-bit field boundary.
 */
uint64_t HELPER(neon_addlp_s8)(uint64_t a)
{
    uint64_t nsignmask = 0x0080008000800080ULL;
    uint64_t wsignmask = 0x8000800080008000ULL;
    uint64_t elementmask = 0x00ff00ff00ff00ffULL;
    uint64_t tmp1, tmp2;
    uint64_t res, signres;

    /* Extract odd elements, sign extend each to a 16 bit field */
    tmp1 = a & elementmask;
    tmp1 ^= nsignmask;
    tmp1 |= wsignmask;
    tmp1 = (tmp1 - nsignmask) ^ wsignmask;
    /* Ditto for the even elements */
    tmp2 = (a >> 8) & elementmask;
    tmp2 ^= nsignmask;
    tmp2 |= wsignmask;
    tmp2 = (tmp2 - nsignmask) ^ wsignmask;

    /* calculate the result by summing bits 0..14, 16..22, etc,
     * and then adjusting the sign bits 15, 23, etc manually.
     * This ensures the addition can't overflow the 16 bit field.
     */
    signres = (tmp1 ^ tmp2) & wsignmask;
    res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask);
    res ^= signres;

    return res;
}

/* Pairwise long add of four s16 lanes into two s32 lanes. */
uint64_t HELPER(neon_addlp_s16)(uint64_t a)
{
    int32_t reslo, reshi;

    reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16);
    reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48);

    return (uint32_t)reslo | (((uint64_t)reshi) << 32);
}
871
/*
 * Saturating add of two s32 lanes packed in 64-bit operands.  Signed
 * overflow occurred iff the operands have the same sign but the sum's
 * sign differs; on overflow the lane saturates toward the sign of the
 * operands and QC is set.
 */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

/* Saturating add of two s64 values, same overflow rule as above. */
uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}
905
906 /* We have to do the arithmetic in a larger type than
907 * the input type, because for example with a signed 32 bit
908 * op the absolute difference can overflow a signed 32 bit value.
909 */
/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
/* Absolute difference: |x - y| computed in 'arithtype' after widening. */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

/* Widening absolute difference: four u8 lane pairs -> four u16 lanes. */
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

/* Widening absolute difference: four s8 lane pairs -> four s16 lanes. */
uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

/* Widening absolute difference: two u16 lane pairs -> two u32 lanes. */
uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

/* Widening absolute difference: two s16 lane pairs -> two s32 lanes. */
uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

/* Widening absolute difference of one u32 pair -> u64. */
uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

/* Widening absolute difference of one s32 pair -> s64. */
uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD
976
/* Widening multiply. Named type is the source type. */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

/* Widening multiply: four u8 lane pairs -> four u16 product lanes. */
uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/*
 * Widening multiply: four s8 lane pairs -> four s16 product lanes.
 * type1 = int8_t performs the sign extension; the unsigned type2 keeps
 * each 16-bit product from spilling into the neighbouring lane when
 * or-ed into the result.
 */
uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* Widening multiply: two u16 lane pairs -> two u32 product lanes. */
uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

/* Widening multiply: two s16 lane pairs -> two s32 product lanes. */
uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}
1033
/* Lane-wise two's-complement negation of four 16-bit lanes. */
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

/* Lane-wise two's-complement negation of two 32-bit lanes. */
uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}
1054
1055 /* Saturating sign manipulation. */
1056 /* ??? Make these use NEON_VOP1 */
/* Saturating sign manipulation. */
/* ??? Make these use NEON_VOP1 */
/* Saturating absolute value: INT8_MIN saturates to INT8_MAX and sets QC. */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

/* Saturating negation: INT8_MIN saturates to INT8_MAX and sets QC. */
#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

/* 16-bit variant: INT16_MIN saturates to INT16_MAX and sets QC. */
#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

/* 16-bit saturating negation. */
#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16
1132
HELPER(neon_qabs_s32)1133 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
1134 {
1135 if (x == SIGNBIT) {
1136 SET_QC();
1137 x = ~SIGNBIT;
1138 } else if ((int32_t)x < 0) {
1139 x = -x;
1140 }
1141 return x;
1142 }
1143
HELPER(neon_qneg_s32)1144 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
1145 {
1146 if (x == SIGNBIT) {
1147 SET_QC();
1148 x = ~SIGNBIT;
1149 } else {
1150 x = -x;
1151 }
1152 return x;
1153 }
1154
HELPER(neon_qabs_s64)1155 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
1156 {
1157 if (x == SIGNBIT64) {
1158 SET_QC();
1159 x = ~SIGNBIT64;
1160 } else if ((int64_t)x < 0) {
1161 x = -x;
1162 }
1163 return x;
1164 }
1165
HELPER(neon_qneg_s64)1166 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
1167 {
1168 if (x == SIGNBIT64) {
1169 SET_QC();
1170 x = ~SIGNBIT64;
1171 } else {
1172 x = -x;
1173 }
1174 return x;
1175 }
1176
1177 /* NEON Float helpers. */
1178
1179 /* Floating point comparisons produce an integer result.
1180 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1181 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1182 */
HELPER(neon_ceq_f32)1183 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, float_status *fpst)
1184 {
1185 return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
1186 }
1187
HELPER(neon_cge_f32)1188 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, float_status *fpst)
1189 {
1190 return -float32_le(make_float32(b), make_float32(a), fpst);
1191 }
1192
HELPER(neon_cgt_f32)1193 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, float_status *fpst)
1194 {
1195 return -float32_lt(make_float32(b), make_float32(a), fpst);
1196 }
1197
HELPER(neon_acge_f32)1198 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, float_status *fpst)
1199 {
1200 float32 f0 = float32_abs(make_float32(a));
1201 float32 f1 = float32_abs(make_float32(b));
1202 return -float32_le(f1, f0, fpst);
1203 }
1204
HELPER(neon_acgt_f32)1205 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, float_status *fpst)
1206 {
1207 float32 f0 = float32_abs(make_float32(a));
1208 float32 f1 = float32_abs(make_float32(b));
1209 return -float32_lt(f1, f0, fpst);
1210 }
1211
HELPER(neon_acge_f64)1212 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, float_status *fpst)
1213 {
1214 float64 f0 = float64_abs(make_float64(a));
1215 float64 f1 = float64_abs(make_float64(b));
1216 return -float64_le(f1, f0, fpst);
1217 }
1218
HELPER(neon_acgt_f64)1219 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, float_status *fpst)
1220 {
1221 float64 f0 = float64_abs(make_float64(a));
1222 float64 f1 = float64_abs(make_float64(b));
1223 return -float64_lt(f1, f0, fpst);
1224 }
1225
/* Extract element N (of width SIZE bits) from the 64-bit vector value V. */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1227
HELPER(neon_qunzip8)1228 void HELPER(neon_qunzip8)(void *vd, void *vm)
1229 {
1230 uint64_t *rd = vd, *rm = vm;
1231 uint64_t zd0 = rd[0], zd1 = rd[1];
1232 uint64_t zm0 = rm[0], zm1 = rm[1];
1233
1234 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
1235 | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
1236 | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
1237 | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
1238 uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
1239 | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
1240 | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1241 | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
1242 uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
1243 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
1244 | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
1245 | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
1246 uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
1247 | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
1248 | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
1249 | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1250
1251 rm[0] = m0;
1252 rm[1] = m1;
1253 rd[0] = d0;
1254 rd[1] = d1;
1255 }
1256
HELPER(neon_qunzip16)1257 void HELPER(neon_qunzip16)(void *vd, void *vm)
1258 {
1259 uint64_t *rd = vd, *rm = vm;
1260 uint64_t zd0 = rd[0], zd1 = rd[1];
1261 uint64_t zm0 = rm[0], zm1 = rm[1];
1262
1263 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
1264 | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
1265 uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
1266 | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
1267 uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
1268 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
1269 uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
1270 | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1271
1272 rm[0] = m0;
1273 rm[1] = m1;
1274 rd[0] = d0;
1275 rd[1] = d1;
1276 }
1277
HELPER(neon_qunzip32)1278 void HELPER(neon_qunzip32)(void *vd, void *vm)
1279 {
1280 uint64_t *rd = vd, *rm = vm;
1281 uint64_t zd0 = rd[0], zd1 = rd[1];
1282 uint64_t zm0 = rm[0], zm1 = rm[1];
1283
1284 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
1285 uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1286 uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
1287 uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1288
1289 rm[0] = m0;
1290 rm[1] = m1;
1291 rd[0] = d0;
1292 rd[1] = d1;
1293 }
1294
HELPER(neon_unzip8)1295 void HELPER(neon_unzip8)(void *vd, void *vm)
1296 {
1297 uint64_t *rd = vd, *rm = vm;
1298 uint64_t zd = rd[0], zm = rm[0];
1299
1300 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
1301 | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
1302 | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1303 | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
1304 uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
1305 | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
1306 | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
1307 | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1308
1309 rm[0] = m0;
1310 rd[0] = d0;
1311 }
1312
HELPER(neon_unzip16)1313 void HELPER(neon_unzip16)(void *vd, void *vm)
1314 {
1315 uint64_t *rd = vd, *rm = vm;
1316 uint64_t zd = rd[0], zm = rm[0];
1317
1318 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
1319 | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
1320 uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
1321 | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1322
1323 rm[0] = m0;
1324 rd[0] = d0;
1325 }
1326
HELPER(neon_qzip8)1327 void HELPER(neon_qzip8)(void *vd, void *vm)
1328 {
1329 uint64_t *rd = vd, *rm = vm;
1330 uint64_t zd0 = rd[0], zd1 = rd[1];
1331 uint64_t zm0 = rm[0], zm1 = rm[1];
1332
1333 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
1334 | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
1335 | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
1336 | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
1337 uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
1338 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
1339 | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
1340 | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
1341 uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
1342 | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
1343 | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1344 | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
1345 uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
1346 | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
1347 | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
1348 | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1349
1350 rm[0] = m0;
1351 rm[1] = m1;
1352 rd[0] = d0;
1353 rd[1] = d1;
1354 }
1355
HELPER(neon_qzip16)1356 void HELPER(neon_qzip16)(void *vd, void *vm)
1357 {
1358 uint64_t *rd = vd, *rm = vm;
1359 uint64_t zd0 = rd[0], zd1 = rd[1];
1360 uint64_t zm0 = rm[0], zm1 = rm[1];
1361
1362 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
1363 | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
1364 uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
1365 | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
1366 uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
1367 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
1368 uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
1369 | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1370
1371 rm[0] = m0;
1372 rm[1] = m1;
1373 rd[0] = d0;
1374 rd[1] = d1;
1375 }
1376
HELPER(neon_qzip32)1377 void HELPER(neon_qzip32)(void *vd, void *vm)
1378 {
1379 uint64_t *rd = vd, *rm = vm;
1380 uint64_t zd0 = rd[0], zd1 = rd[1];
1381 uint64_t zm0 = rm[0], zm1 = rm[1];
1382
1383 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
1384 uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
1385 uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1386 uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1387
1388 rm[0] = m0;
1389 rm[1] = m1;
1390 rd[0] = d0;
1391 rd[1] = d1;
1392 }
1393
HELPER(neon_zip8)1394 void HELPER(neon_zip8)(void *vd, void *vm)
1395 {
1396 uint64_t *rd = vd, *rm = vm;
1397 uint64_t zd = rd[0], zm = rm[0];
1398
1399 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
1400 | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
1401 | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1402 | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
1403 uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
1404 | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
1405 | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
1406 | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1407
1408 rm[0] = m0;
1409 rd[0] = d0;
1410 }
1411
HELPER(neon_zip16)1412 void HELPER(neon_zip16)(void *vd, void *vm)
1413 {
1414 uint64_t *rd = vd, *rm = vm;
1415 uint64_t zd = rd[0], zm = rm[0];
1416
1417 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
1418 | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
1419 uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
1420 | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1421
1422 rm[0] = m0;
1423 rd[0] = d0;
1424 }
1425