/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Raise the saturation ("QC") flag; only usable where `env` is in scope. */
#define SET_QC() env->vfp.qc[0] = 1

/*
 * Container types viewing a uint32_t as 1, 2 or 4 lanes.  Field order is
 * reversed on big-endian hosts so that v1..vN name the same lanes of the
 * packed 32-bit value regardless of host byte order.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)
/* Copy from a vector structure type to a uint32_t. */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Expand NEON_FN once for each lane of the operand structures. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/*
 * Shared body for 32-bit-packed vector ops: unpack both arguments,
 * apply NEON_FN to each lane, then repack the result.
 */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/*
 * Whole-vector (gvec) binary op: apply NEON_FN elementwise over
 * opr_sz bytes, then zero the tail of the register up to maxsz.
 */
#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, CPUARMState *env, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

/* As above, but the second operand is an immediate from simd_data(desc). */
#define NEON_GVEC_VOP2i_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, CPUARMState *env, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int imm = simd_data(desc); \
    vtype *d = vd, *n = vn; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], imm); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same. */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators. */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}

/* Pairwise minimum: each result lane is min of an adjacent input pair. */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Pairwise maximum. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

/*
 * Shift helpers.  The shift count is always the signed low byte of the
 * second operand; the do_*shl_* helpers (see vec_internal.h) take the
 * element width in bits, a rounding flag, and an optional saturation
 * flag pointer (NULL = non-saturating).
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

/* Signed rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

/* Unsigned rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}

/* Unsigned saturating shifts (set QC on saturation via env->vfp.qc). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Signed saturating shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Signed-input, unsigned-result saturating shifts (VQSHLU). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_h, int16_t)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_d, int64_t)
#undef NEON_FN

/* Unsigned saturating rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/* Signed saturating rounding shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

/*
 * Add packed u8 lanes without carry between lanes: strip each lane's
 * top bit, add, then patch the top bits back in with an XOR of the
 * original sign information.
 */
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

/* Same carry-isolation trick for two packed u16 lanes. */
uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* VTST: lane is all-ones if the operands share any set bit, else zero. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN
/* Count Leading Sign/Zero Bits. */

/* Count leading zeros in an 8-bit value; returns 8 for zero input. */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

/* Count leading zeros in a 16-bit value; returns 16 for zero input. */
static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* Leading sign bits: clz of the value (or its complement if negative),
   minus one so the sign bit itself is not counted. */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}

/*
 * Saturating doubling multiply returning high half.
 * (tmp ^ (tmp << 1)) & SIGNBIT detects overflow of the doubling step;
 * when 'round' is set, add the rounding constant and saturate if that
 * addition itself overflows.  QC is set on any saturation.
 */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

/* 32-bit variant of the same saturating doubling multiply-high. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

/* Narrow by taking the high half of each 16-bit element. */
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

/* Narrow by taking the high half of each 32-bit element. */
uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* As narrow_high_u8, but round by adding half the discarded range first. */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

/* As narrow_high_u16, but with rounding. */
uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}
/* Only the low 32-bits of output are significant. */
/* Narrow signed 16-bit lanes to unsigned 8-bit with saturation:
   negative lanes become 0 (the lane's bits in res stay clear). */
uint64_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Only the low 32-bits of output are significant. */
/* Narrow unsigned 16-bit lanes to unsigned 8-bit with saturation. */
uint64_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Only the low 32-bits of output are significant. */
/* Narrow signed 16-bit lanes to signed 8-bit with saturation:
   (s >> 15) ^ 0x7f yields 0x7f or 0x80 depending on the sign. */
uint64_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Only the low 32-bits of output are significant. */
/* Narrow signed 32-bit lanes to unsigned 16-bit with saturation. */
uint64_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}
/* Only the low 32-bits of output are significant. */
/* Narrow unsigned 32-bit lanes to unsigned 16-bit with saturation. */
uint64_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant. */
/* Narrow signed 32-bit lanes to signed 16-bit with saturation. */
uint64_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant. */
/* Narrow a signed 64-bit value to unsigned 32-bit with saturation. */
uint64_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Only the low 32-bits of output are significant. */
/* Narrow an unsigned 64-bit value to unsigned 32-bit with saturation. */
uint64_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}
/* Only the low 32-bits of output are significant. */
/* Narrow a signed 64-bit value to signed 32-bit with saturation. */
uint64_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return (uint32_t)((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return (uint32_t)x;
}

/* Widen each unsigned 8-bit lane to a 16-bit lane of the 64-bit result. */
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

/* Widen each signed 8-bit lane to a sign-extended 16-bit lane. */
uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

/* Widen each unsigned 16-bit lane to a 32-bit lane. */
uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

/* Widen each signed 16-bit lane to a sign-extended 32-bit lane. */
uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

/* Pairwise long add: add pairs of adjacent elements into
 * double-width elements in the result (eg _s8 is an 8x8->16 op)
 */
uint64_t HELPER(neon_addlp_s8)(uint64_t a)
{
    uint64_t nsignmask = 0x0080008000800080ULL;
    uint64_t wsignmask = 0x8000800080008000ULL;
    uint64_t elementmask = 0x00ff00ff00ff00ffULL;
    uint64_t tmp1, tmp2;
    uint64_t res, signres;

    /* Extract odd elements, sign extend each to a 16 bit field */
    tmp1 = a & elementmask;
    tmp1 ^= nsignmask;
    tmp1 |= wsignmask;
    tmp1 = (tmp1 - nsignmask) ^ wsignmask;
    /* Ditto for the even elements */
    tmp2 = (a >> 8) & elementmask;
    tmp2 ^= nsignmask;
    tmp2 |= wsignmask;
    tmp2 = (tmp2 - nsignmask) ^ wsignmask;

    /* calculate the result by summing bits 0..14, 16..22, etc,
     * and then adjusting the sign bits 15, 23, etc manually.
     * This ensures the addition can't overflow the 16 bit field.
     */
    signres = (tmp1 ^ tmp2) & wsignmask;
    res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask);
    res ^= signres;

    return res;
}

/* Pairwise long add of signed 16-bit lanes into 32-bit lanes. */
uint64_t HELPER(neon_addlp_s16)(uint64_t a)
{
    int32_t reslo, reshi;

    reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16);
    reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48);

    return (uint32_t)reslo | (((uint64_t)reshi) << 32);
}

/* Add corresponding signed 32-bit lanes with saturation; QC on overflow. */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

/* 64-bit signed saturating add; QC on overflow. */
uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD
/* Widening multiply. Named type is the source type. */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}

/* Negate each 16-bit lane of a 64-bit vector. */
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

/* Negate each 32-bit lane of a 64-bit vector. */
uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* Saturating sign manipulation. */
/* ??? Make these use NEON_VOP1 */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

/* Saturating absolute value: INT32_MIN saturates to INT32_MAX, sets QC. */
uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

/* Saturating negation: INT32_MIN saturates to INT32_MAX, sets QC. */
uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else if ((int64_t)x < 0) {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else {
        x = -x;
    }
    return x;
}

/* NEON Float helpers. */

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}

/* Absolute compares: |a| >= |b| and |a| > |b|. */
uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}

uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, float_status *fpst)
{
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, float_status *fpst)
{
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}

/* Extract element N of SIZE bits from vector word V. */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

/* VUZP (quad): de-interleave — even elements to d, odd elements to m. */
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

/* VUZP (double): same de-interleave over a single 64-bit register pair. */
void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}
void HELPER(neon_qzip8)(void *vd, void *vm) 1326 { 1327 uint64_t *rd = vd, *rm = vm; 1328 uint64_t zd0 = rd[0], zd1 = rd[1]; 1329 uint64_t zm0 = rm[0], zm1 = rm[1]; 1330 1331 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) 1332 | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) 1333 | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) 1334 | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); 1335 uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) 1336 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) 1337 | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) 1338 | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); 1339 uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) 1340 | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) 1341 | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1342 | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); 1343 uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) 1344 | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) 1345 | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) 1346 | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1347 1348 rm[0] = m0; 1349 rm[1] = m1; 1350 rd[0] = d0; 1351 rd[1] = d1; 1352 } 1353 1354 void HELPER(neon_qzip16)(void *vd, void *vm) 1355 { 1356 uint64_t *rd = vd, *rm = vm; 1357 uint64_t zd0 = rd[0], zd1 = rd[1]; 1358 uint64_t zm0 = rm[0], zm1 = rm[1]; 1359 1360 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) 1361 | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); 1362 uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) 1363 | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); 1364 uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) 1365 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); 1366 uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16) 1367 | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1368 1369 rm[0] = m0; 1370 rm[1] = m1; 1371 rd[0] = d0; 1372 rd[1] = d1; 1373 } 1374 1375 void HELPER(neon_qzip32)(void *vd, void *vm) 1376 { 1377 
uint64_t *rd = vd, *rm = vm; 1378 uint64_t zd0 = rd[0], zd1 = rd[1]; 1379 uint64_t zm0 = rm[0], zm1 = rm[1]; 1380 1381 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); 1382 uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); 1383 uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1384 uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1385 1386 rm[0] = m0; 1387 rm[1] = m1; 1388 rd[0] = d0; 1389 rd[1] = d1; 1390 } 1391 1392 void HELPER(neon_zip8)(void *vd, void *vm) 1393 { 1394 uint64_t *rd = vd, *rm = vm; 1395 uint64_t zd = rd[0], zm = rm[0]; 1396 1397 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) 1398 | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) 1399 | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1400 | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); 1401 uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) 1402 | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) 1403 | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) 1404 | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1405 1406 rm[0] = m0; 1407 rd[0] = d0; 1408 } 1409 1410 void HELPER(neon_zip16)(void *vd, void *vm) 1411 { 1412 uint64_t *rd = vd, *rm = vm; 1413 uint64_t zd = rd[0], zm = rm[0]; 1414 1415 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) 1416 | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); 1417 uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) 1418 | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1419 1420 rm[0] = m0; 1421 rd[0] = d0; 1422 } 1423