/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

#define HELPER_H "tcg/helper.h"
#include "exec/helper-proto.h.inc"

#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

#define SET_QC() env->vfp.qc[0] = 1

#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t. */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, CPUARMState *env, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}
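
/*
 * As NEON_GVEC_VOP2_ENV, but the second operand is a constant taken
 * from the SIMD descriptor rather than a third vector register; used
 * by the shift-by-immediate helpers generated below.
 */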
#define NEON_GVEC_VOP2i_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, CPUARMState *env, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int imm = simd_data(desc); \
    vtype *d = vd, *n = vn; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], imm); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same. */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators. */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}

#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN
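
/*
 * Shifts: per the NEON semantics, only the bottom byte of the shift
 * register is significant (hence the (int8_t) casts below), and a
 * negative count shifts right.  The do_{u,s}qrshl_bhs/_d helpers come
 * from vec_internal.h; the bool selects rounding, and the final
 * argument is the saturation flag pointer, NULL for the
 * non-saturating forms.
 */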
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}
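
/*
 * Saturating shifts: results that overflow the element width
 * saturate and set the sticky QC flag via env->vfp.qc.
 */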
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
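
/*
 * VQSHLU: signed input, unsigned saturated output.  Negative inputs
 * saturate to zero and set QC.
 */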
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_h, int16_t)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_d, int64_t)
#undef NEON_FN
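
/*
 * Rounding saturating shifts (VQRSHL): a rounding right shift adds
 * 1 << (shift - 1) before truncating, and overflow still saturates
 * and sets QC.
 */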
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
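
/*
 * Lane-wise add on packed values without cross-lane carries: add the
 * lanes with their top bits masked off, then restore each top bit
 * with an XOR, so a carry out of one lane never reaches the next.
 */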
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Count Leading Sign/Zero Bits. */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN
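
/*
 * Count Leading Sign bits, i.e. the number of bits below the sign bit
 * that match it.  Complementing negative inputs reduces this to
 * "count leading zeros minus one": cls_s8(-1) == cls_s8(0) == 7.
 */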
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}
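
/*
 * Saturating doubling multiply returning high half (VQDMULH/VQRDMULH):
 * the double-width product is doubled, optionally rounded by adding
 * half an output LSB, and the high half is returned.  Both the
 * doubling (for INT_MIN * INT_MIN) and the rounding increment can
 * overflow; each case saturates and sets QC.
 */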
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
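
/*
 * Narrowing: each 2N-bit element of the 64-bit input becomes an N-bit
 * element of the 32-bit result, keeping either the low half (narrow),
 * the high half (narrow_high), or the high half after adding a
 * rounding bias (narrow_round_high).  The _sat variants saturate and
 * set QC; the "unarrow" forms narrow signed input to unsigned output.
 */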
/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return (uint32_t)((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return (uint32_t)x;
}

uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

/* Pairwise long add: add pairs of adjacent elements into
 * double-width elements in the result (eg _s8 is an 8x8->16 op)
 */
uint64_t HELPER(neon_addlp_s8)(uint64_t a)
{
    uint64_t nsignmask = 0x0080008000800080ULL;
    uint64_t wsignmask = 0x8000800080008000ULL;
    uint64_t elementmask = 0x00ff00ff00ff00ffULL;
    uint64_t tmp1, tmp2;
    uint64_t res, signres;

    /* Extract odd elements, sign extend each to a 16 bit field */
    tmp1 = a & elementmask;
    tmp1 ^= nsignmask;
    tmp1 |= wsignmask;
    tmp1 = (tmp1 - nsignmask) ^ wsignmask;
    /* Ditto for the even elements */
    tmp2 = (a >> 8) & elementmask;
    tmp2 ^= nsignmask;
    tmp2 |= wsignmask;
    tmp2 = (tmp2 - nsignmask) ^ wsignmask;

    /* calculate the result by summing bits 0..14, 16..30, etc,
     * and then adjusting the sign bits 15, 31, etc manually.
     * This ensures the addition can't overflow the 16 bit field.
     */
    signres = (tmp1 ^ tmp2) & wsignmask;
    res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask);
    res ^= signres;

    return res;
}

uint64_t HELPER(neon_addlp_s16)(uint64_t a)
{
    int32_t reslo, reshi;

    reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16);
    reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48);

    return (uint32_t)reslo | (((uint64_t)reshi) << 32);
}
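
/*
 * Saturating addition on the two 32-bit halves (or on the whole
 * 64-bit value): overflow occurs when the operands have the same sign
 * but the sum's sign differs, in which case the result saturates
 * toward the operands' sign and QC is set.
 */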
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do { \
    arithtype tmp_x = (intype)(x); \
    arithtype tmp_y = (intype)(y); \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD

/* Widening multiply. Named type is the source type. */
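/*
 * For the signed cases the product is still computed in an unsigned
 * result type: the value fits either way, and unsigned arithmetic
 * keeps the subsequent shift-and-OR lane packing free of sign
 * extension.
 */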
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* Saturating sign manipulation. */
/* ??? Make these use NEON_VOP1 */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else if ((int64_t)x < 0) {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else {
        x = -x;
    }
    return x;
}

/* NEON Float helpers. */

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}

uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, float_status *fpst)
{
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, float_status *fpst)
{
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}

#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
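
/*
 * ELEM extracts element N of SIZE bits from a 64-bit word, e.g.
 * ELEM(0x0123456789abcdef, 1, 16) == 0x89ab.  VZIP interleaves the
 * elements of d and m; VUZP is the inverse, gathering even-indexed
 * elements into d and odd-indexed elements into m.  The "q" forms
 * operate on 128-bit registers, i.e. two uint64_t words per operand.
 */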
void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
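
/*
 * The non-q VZIP forms below interleave a single 64-bit doubleword
 * from each operand.
 */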
zd1 = rd[1]; 1381 uint64_t zm0 = rm[0], zm1 = rm[1]; 1382 1383 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); 1384 uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); 1385 uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1386 uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1387 1388 rm[0] = m0; 1389 rm[1] = m1; 1390 rd[0] = d0; 1391 rd[1] = d1; 1392 } 1393 1394 void HELPER(neon_zip8)(void *vd, void *vm) 1395 { 1396 uint64_t *rd = vd, *rm = vm; 1397 uint64_t zd = rd[0], zm = rm[0]; 1398 1399 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) 1400 | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) 1401 | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1402 | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); 1403 uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) 1404 | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) 1405 | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) 1406 | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1407 1408 rm[0] = m0; 1409 rd[0] = d0; 1410 } 1411 1412 void HELPER(neon_zip16)(void *vd, void *vm) 1413 { 1414 uint64_t *rd = vd, *rm = vm; 1415 uint64_t zd = rd[0], zm = rm[0]; 1416 1417 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) 1418 | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); 1419 uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) 1420 | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1421 1422 rm[0] = m0; 1423 rd[0] = d0; 1424 } 1425