/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

#define HELPER_H "tcg/helper.h"
#include "exec/helper-proto.h.inc"

#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

#define SET_QC() env->vfp.qc[0] = 1

#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#if HOST_BIG_ENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t. */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_GVEC_VOP2(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

#define NEON_GVEC_VOP2_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, void *vm, CPUARMState *env, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    vtype *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], m[i]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}
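
/*
 * Illustrative expansion (not an instantiation used by this file): given
 *
 *     #define NEON_FN(dest, src1, src2) dest = src1 + src2
 *     NEON_GVEC_VOP2(gvec_example_b, uint8_t)
 *
 * the macro above would emit a helper that applies NEON_FN to every byte
 * of the operand vectors and then clears the tail of the destination
 * beyond opr_sz; the "gvec_example_b" name is hypothetical.
 */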

#define NEON_GVEC_VOP2i_ENV(name, vtype) \
void HELPER(name)(void *vd, void *vn, CPUARMState *env, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int imm = simd_data(desc); \
    vtype *d = vd, *n = vn; \
    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
        NEON_FN(d[i], n[i], imm); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same. */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators. */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}

#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN
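
/*
 * Sketch of the pairwise layout with hypothetical lane values: if arg1
 * packs the bytes {a1, a2, a3, a4} and arg2 packs {b1, b2, b3, b4},
 * then neon_pmin_u8 returns {min(a1,a2), min(a3,a4), min(b1,b2),
 * min(b3,b4)}, i.e. each input contributes its adjacent pairs.
 */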

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_GVEC_VOP2(gvec_srshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_GVEC_VOP2(gvec_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, src2, 16, true, NULL))
NEON_GVEC_VOP2(sme2_srshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, src2, 32, true, NULL))
NEON_GVEC_VOP2(sme2_srshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_srshl_d, int64_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, src2, true, NULL))
NEON_GVEC_VOP2(sme2_srshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_GVEC_VOP2(gvec_urshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_GVEC_VOP2(gvec_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int16_t)src2, 16, true, NULL))
NEON_GVEC_VOP2(sme2_urshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, src2, 32, true, NULL))
NEON_GVEC_VOP2(sme2_urshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, NULL))
NEON_GVEC_VOP2(gvec_urshl_d, int64_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, src2, true, NULL))
NEON_GVEC_VOP2(sme2_urshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}
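
/*
 * Summarising the convention used above (the helpers themselves live in
 * vec_internal.h): the do_{s,u}qrshl_* routines take the element value,
 * the shift count, the element width in bits, a rounding flag, and an
 * optional saturation-flag pointer; a NULL pointer, as in the plain
 * shifts here, selects the non-saturating behaviour.  The (int8_t)
 * casts implement the NEON rule that only the bottom byte of the shift
 * register is significant, with negative counts shifting right.
 */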

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
NEON_GVEC_VOP2i_ENV(neon_uqshli_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
NEON_GVEC_VOP2i_ENV(neon_sqshli_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
NEON_GVEC_VOP2i_ENV(neon_sqshlui_h, int16_t)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_s, int32_t)
#undef NEON_FN
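
/*
 * Behavioural note (restating the architecture, not adding logic): the
 * "qshlu" helpers implement VQSHLU, a saturating left shift of a signed
 * input to an unsigned result, so a negative input saturates to zero
 * and sets QC, and overflow saturates to the unsigned maximum.
 */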

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2i_ENV(neon_sqshlui_d, int64_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqrshl_b, uint8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqrshl_h, uint16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_s, uint32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqrshl_d, uint64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqrshl_b, int8_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqrshl_h, int16_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_s, int32_t)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_d(src1, (int8_t)src2, true, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqrshl_d, int64_t)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN
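
/*
 * Worked example of the carry-isolation trick in neon_add_u8 above,
 * with hypothetical operands a = 0x00ff0000 and b = 0x00010000: the
 * per-byte sign bits are masked off, so the 32-bit add cannot carry
 * across byte lanes (0x7f + 0x01 here), and the XOR with the saved
 * mask restores the lost top bits, yielding 0x00000000 rather than a
 * ripple into the next byte.
 */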

/* Count Leading Sign/Zero Bits. */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}

#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}
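
/*
 * For example (hypothetical input): neon_narrow_u8 keeps the low byte
 * of each 16-bit element, so x = 0x1122334455667788 narrows to
 * 0x22446688 in the low 32 bits of the result.
 */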

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}
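
/*
 * A small illustration with a hypothetical input: neon_narrow_sat_u8
 * saturates each 16-bit element independently, so
 * x = 0x012300ff80000012 produces 0xffffff12 and sets QC because the
 * 0x0123 and 0x8000 elements exceed 0xff, while 0x00ff and 0x0012 pass
 * through unchanged.
 */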

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return deposit32(low, 16, 16, high);
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Only the low 32-bits of output are significant. */
uint64_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return (uint32_t)((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return (uint32_t)x;
}

uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}
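
/*
 * For instance (hypothetical input): neon_widen_s8 sign-extends each
 * byte into a 16-bit lane, so x = 0x80017f02 widens to
 * 0xff800001007f0002.
 */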

/* Pairwise long add: add pairs of adjacent elements into
 * double-width elements in the result (eg _s8 is an 8x8->16 op)
 */
uint64_t HELPER(neon_addlp_s8)(uint64_t a)
{
    uint64_t nsignmask = 0x0080008000800080ULL;
    uint64_t wsignmask = 0x8000800080008000ULL;
    uint64_t elementmask = 0x00ff00ff00ff00ffULL;
    uint64_t tmp1, tmp2;
    uint64_t res, signres;

    /* Extract odd elements, sign extend each to a 16 bit field */
    tmp1 = a & elementmask;
    tmp1 ^= nsignmask;
    tmp1 |= wsignmask;
    tmp1 = (tmp1 - nsignmask) ^ wsignmask;
    /* Ditto for the even elements */
    tmp2 = (a >> 8) & elementmask;
    tmp2 ^= nsignmask;
    tmp2 |= wsignmask;
    tmp2 = (tmp2 - nsignmask) ^ wsignmask;

    /* calculate the result by summing bits 0..14, 16..22, etc,
     * and then adjusting the sign bits 15, 23, etc manually.
     * This ensures the addition can't overflow the 16 bit field.
     */
    signres = (tmp1 ^ tmp2) & wsignmask;
    res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask);
    res ^= signres;

    return res;
}

uint64_t HELPER(neon_addlp_s16)(uint64_t a)
{
    int32_t reslo, reshi;

    reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16);
    reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48);

    return (uint32_t)reslo | (((uint64_t)reshi) << 32);
}

uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do { \
    arithtype tmp_x = (intype)(x); \
    arithtype tmp_y = (intype)(y); \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD
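
/*
 * As an illustration with hypothetical operands: neon_abdl_s16 takes
 * per-byte |a - b| into 16-bit lanes, so a = 0x7f and b = 0x80 in the
 * low byte give |127 - (-128)| = 255, i.e. 0x00ff in lane 0, which is
 * exactly the case that motivates DO_ABD's wider arithmetic type.
 */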

/* Widening multiply. Named type is the source type. */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}
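
/*
 * Example with hypothetical operands: in neon_mull_s8 the byte product
 * of 0xff (-1) and 0x02 (2) is -2, stored as 0xfffe in the matching
 * 16-bit lane; the unsigned intermediate type only keeps the lane
 * packing well defined, the signedness comes from the int8_t casts.
 */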

/* Saturating sign manipulation. */
/* ??? Make these use NEON_VOP1 */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else if ((int64_t)x < 0) {
        x = -x;
    }
    return x;
}

uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
{
    if (x == SIGNBIT64) {
        SET_QC();
        x = ~SIGNBIT64;
    } else {
        x = -x;
    }
    return x;
}
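
/*
 * Saturation example (hypothetical input): neon_qneg_s8 applied to
 * x = 0x80808080 turns every byte (-128, which has no positive
 * counterpart) into 0x7f and sets QC, giving 0x7f7f7f7f.
 */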

/* NEON Float helpers. */

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}

uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, float_status *fpst)
{
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, float_status *fpst)
{
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_le(f1, f0, fpst);
}

uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, float_status *fpst)
{
    float64 f0 = float64_abs(make_float64(a));
    float64 f1 = float64_abs(make_float64(b));
    return -float64_lt(f1, f0, fpst);
}

#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

void HELPER(neon_qunzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qunzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}
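
/*
 * A picture of the (un)zip shuffles with hypothetical lanes, listed low
 * to high: for d = {d0, d1, d2, d3} and m = {m0, m1, m2, m3}, unzip
 * leaves d = {d0, d2, m0, m2} (the even-indexed lanes) and
 * m = {d1, d3, m1, m3} (the odd-indexed lanes); zip performs the
 * inverse interleave.
 */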

void HELPER(neon_qunzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_unzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_unzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_qzip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qzip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_qzip32)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd0 = rd[0], zd1 = rd[1];
    uint64_t zm0 = rm[0], zm1 = rm[1];

    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);

    rm[0] = m0;
    rm[1] = m1;
    rd[0] = d0;
    rd[1] = d1;
}

void HELPER(neon_zip8)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);

    rm[0] = m0;
    rd[0] = d0;
}

void HELPER(neon_zip16)(void *vd, void *vm)
{
    uint64_t *rd = vd, *rm = vm;
    uint64_t zd = rd[0], zm = rm[0];

    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);

    rm[0] = m0;
    rd[0] = d0;
}