/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"


/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup.  */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)  ((x) ^ 7)
#define H2(x)  ((x) ^ 3)
#define H4(x)  ((x) ^ 1)
#else
#define H1(x)  (x)
#define H2(x)  (x)
#define H4(x)  (x)
#endif

#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q

static void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uint64_t *d = vd + opr_sz;
    uintptr_t i;

    for (i = opr_sz; i < max_sz; i += 8) {
        *d++ = 0;
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
static uint16_t inl_qrdmlah_s16(CPUARMState *env, int16_t src1,
                                int16_t src2, int16_t src3)
{
    /* Simplify:
     * = ((a3 << 16) + ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) + (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) + ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        SET_QC();
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint16_t e1 = inl_qrdmlah_s16(env, src1, src2, src3);
    uint16_t e2 = inl_qrdmlah_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlah_s16(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

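/* Worked example for the 16-bit multiply-accumulate form above (operand
 * values are purely illustrative; the multiply-subtract form below differs
 * only in the sign of the product):
 *   src1 = src2 = 0x4000 (0.5 in Q15), src3 = 0x2000 (0.25):
 *     (0x2000 << 15) + 0x4000 * 0x4000 + (1 << 14) = 0x20004000
 *     0x20004000 >> 15 = 0x4000, i.e. 0.5, no saturation.
 *   src1 = src2 = 0x8000 (-1.0), src3 = 0x7fff:
 *     (0x7fff << 15) + (-0x8000) * (-0x8000) + (1 << 14) = 0x7fffc000
 *     0x7fffc000 >> 15 = 0xffff, which does not fit in int16_t,
 *     so QC is set and the result saturates to 0x7fff.
 */
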
/* Signed saturating rounding doubling multiply-subtract high half, 16-bit */
static uint16_t inl_qrdmlsh_s16(CPUARMState *env, int16_t src1,
                                int16_t src2, int16_t src3)
{
    /* Similarly, using subtraction:
     * = ((a3 << 16) - ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) - (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) - ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        SET_QC();
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint16_t e1 = inl_qrdmlsh_s16(env, src1, src2, src3);
    uint16_t e2 = inl_qrdmlsh_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlsh_s16(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    /* Simplify similarly to inl_qrdmlah_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) + ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        SET_QC();
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_neon_qrdmlah_s32(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 32-bit */
uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    /* Simplify similarly to inl_qrdmlsh_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) - ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        SET_QC();
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_neon_qrdmlsh_s32(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

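/* The FCADD helpers below treat adjacent even/odd elements as the real
 * and imaginary parts of a complex number.  The single data bit in DESC
 * selects which part of the second operand is negated before the add:
 *   neg_real = 0 (neg_imag = 1): d = { a.re - b.im, a.im + b.re },
 *     i.e. b is multiplied by +i (rotated by 90 degrees);
 *   neg_real = 1 (neg_imag = 0): d = { a.re + b.im, a.im - b.re },
 *     i.e. b is multiplied by -i (rotated by 270 degrees).
 * See the ARM ARM pseudocode for FCADD for the authoritative definition.
 */
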
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

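/* The FCMLA helpers below compute one half of a complex multiply-add per
 * element pair; the FLIP and NEG_IMAG bits in DESC select which half:
 *   flip = 0, neg_imag = 0:  d.re += a.re * b.re;  d.im += a.re * b.im
 *   flip = 1, neg_imag = 0:  d.re -= a.im * b.im;  d.im += a.im * b.re
 *   flip = 0, neg_imag = 1:  d.re -= a.re * b.re;  d.im -= a.re * b.im
 *   flip = 1, neg_imag = 1:  d.re += a.im * b.im;  d.im -= a.im * b.re
 * which correspond to the architectural rotations of 0, 90, 180 and 270
 * degrees; a full complex multiply-accumulate is built from the 0 and 90
 * degree forms (or 180 and 270 for a subtraction) issued back to back.
 * The _idx variants apply a single complex element of M, loaded once
 * before the loop, to every element pair of N.
 * See the ARM ARM pseudocode for FCMLA for the authoritative definition.
 */
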
void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;
    float16 e1 = m[H2(flip)];
    float16 e3 = m[H2(1 - flip)];

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;
    e1 ^= neg_real;
    e3 ^= neg_imag;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e4 = e2;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;
    float32 e1 = m[H4(flip)];
    float32 e3 = m[H4(1 - flip)];

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;
    e1 ^= neg_real;
    e3 ^= neg_imag;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e4 = e2;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, d[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, d[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

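/* Per FPTrigSMul, the result of FTSMUL is op1 squared, with the sign
 * taken from bit 0 of the integer operand op2 unless the square is a
 * NaN, in which case the NaN is returned unchanged.  Together with
 * FTSSEL and FTMAD these helpers support the SVE polynomial
 * approximations of sine and cosine, where the sign bit carries the
 * quadrant correction.
 */
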
#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

#ifdef TARGET_AARCH64

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

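/* The recps/rsqrts expansions above wrap the Newton-Raphson step
 * functions used by FRECPS and FRSQRTS, which compute 2.0 - (a * b) and
 * (3.0 - a * b) / 2.0 respectively, with special-case handling of
 * infinities and zeros.  They sit under TARGET_AARCH64 because
 * helper_recpsf_* and helper_rsqrtsf_* are only provided by the
 * AArch64 helper set.
 */
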
/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_mul(n[i + j], mm, stat);                     \
        }                                                                  \
    }                                                                      \
}

DO_MUL_IDX(gvec_fmul_idx_h, float16, H2)
DO_MUL_IDX(gvec_fmul_idx_s, float32, H4)
DO_MUL_IDX(gvec_fmul_idx_d, float64, )

#undef DO_MUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                 \
                  void *stat, uint32_t desc)                               \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
                                     mm, a[i + j], 0, stat);               \
        }                                                                  \
    }                                                                      \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, )

#undef DO_FMLA_IDX
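
/* Illustrative example for the indexed expansions above (the operand
 * layout is hypothetical, and the host-endian fixup is omitted for
 * clarity): for float32 with a 256-bit SVE vector, oprsz is 32 and
 * segment is 4, so gvec_fmul_idx_s with idx == 1 computes
 *     d[0..3] = n[0..3] * m[1]
 *     d[4..7] = n[4..7] * m[5]
 * i.e. the indexed element is taken from within each 128-bit segment
 * of M.  For DO_FMLA_IDX, bit 0 of the descriptor data negates the
 * first operand (producing the multiply-subtract forms) and the
 * remaining data bits hold the element index.
 */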