/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"


/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup.  */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)  ((x) ^ 7)
#define H2(x)  ((x) ^ 3)
#define H4(x)  ((x) ^ 1)
#else
#define H1(x)  (x)
#define H2(x)  (x)
#define H4(x)  (x)
#endif

#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q

static void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uint64_t *d = vd + opr_sz;
    uintptr_t i;

    for (i = opr_sz; i < max_sz; i += 8) {
        *d++ = 0;
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
static uint16_t inl_qrdmlah_s16(CPUARMState *env, int16_t src1,
                                int16_t src2, int16_t src3)
{
    /* Simplify:
     * = ((a3 << 16) + ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) + (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) + ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        SET_QC();
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}
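
/* Worked example for inl_qrdmlah_s16 above (illustrative only):
 * with src1 = src2 = 0x4000 and src3 = 0, the simplified form gives
 * 0x4000 * 0x4000 + (1 << 14) = 0x10004000, and 0x10004000 >> 15 = 0x2000,
 * which fits in int16_t, so no saturation occurs.  With src1 = src2 =
 * INT16_MIN the shifted result is 0x8000, which does not fit in int16_t,
 * so the result saturates to 0x7fff and the QC flag is set via SET_QC().
 */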

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint16_t e1 = inl_qrdmlah_s16(env, src1, src2, src3);
    uint16_t e2 = inl_qrdmlah_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlah_s16(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 16-bit */
static uint16_t inl_qrdmlsh_s16(CPUARMState *env, int16_t src1,
                                int16_t src2, int16_t src3)
{
    /* Similarly, using subtraction:
     * = ((a3 << 16) - ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) - (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) - ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        SET_QC();
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint16_t e1 = inl_qrdmlsh_s16(env, src1, src2, src3);
    uint16_t e2 = inl_qrdmlsh_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlsh_s16(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    /* Simplify similarly to inl_qrdmlah_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) + ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        SET_QC();
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_neon_qrdmlah_s32(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 32-bit */
uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    /* Simplify similarly to inl_qrdmlsh_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) - ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        SET_QC();
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_neon_qrdmlsh_s32(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the 64-bit lanes.
 * All elements are treated equally, no matter where they are.
 */

void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd;
    int8_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] += n[i * 4 + 0] * m[i * 4 + 0]
              + n[i * 4 + 1] * m[i * 4 + 1]
              + n[i * 4 + 2] * m[i * 4 + 2]
              + n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd;
    uint8_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] += n[i * 4 + 0] * m[i * 4 + 0]
              + n[i * 4 + 1] * m[i * 4 + 1]
              + n[i * 4 + 2] * m[i * 4 + 2]
              + n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd;
    int16_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] += (int64_t)n[i * 4 + 0] * m[i * 4 + 0]
              + (int64_t)n[i * 4 + 1] * m[i * 4 + 1]
              + (int64_t)n[i * 4 + 2] * m[i * 4 + 2]
              + (int64_t)n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd;
    uint16_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] += (uint64_t)n[i * 4 + 0] * m[i * 4 + 0]
              + (uint64_t)n[i * 4 + 1] * m[i * 4 + 1]
              + (uint64_t)n[i * 4 + 2] * m[i * 4 + 2]
              + (uint64_t)n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
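
/* Floating-point complex add (FCADD).
 *
 * Each pair of elements is treated as a complex number, with the even
 * element the real part and the odd element the imaginary part.  The
 * single descriptor bit selects whether the real or the imaginary
 * element of the second operand is negated before the add, which is
 * what distinguishes the #90 and #270 rotations.
 */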

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
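
/* Floating-point complex multiply-accumulate (FCMLA).
 *
 * Each helper call accumulates one product per destination element,
 * i.e. half of a full complex multiplication; a complete complex
 * multiply-accumulate uses two FCMLA operations with different
 * rotations.  The 'flip' bit selects whether the real or the imaginary
 * element of the first operand is the common multiplicand, and the neg
 * bits supply the signs; the flip/neg combinations between them cover
 * the four architectural rotations.
 */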

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
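
/* As above, but with the second operand taken from a single complex
 * element of vm.  Within each 128-bit segment, the complex value at
 * 'index' is broadcast to every pair of elements, matching the
 * by-element (indexed) form of FCMLA.
 */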

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = 16 / sizeof(float16);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, d[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, d[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = 16 / sizeof(float32);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, d[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, d[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, d[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, d[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

#undef DO_2OP

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

#ifdef TARGET_AARCH64

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */
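
/* For example: with a 256-bit SVE vector of float32 and index 1,
 * elements 0-3 are all multiplied by element 1 of vm and elements 4-7
 * by element 5, i.e. each 128-bit segment uses its own copy of the
 * indexed element.
 */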

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_mul(n[i + j], mm, stat);                     \
        }                                                                  \
    }                                                                      \
}

DO_MUL_IDX(gvec_fmul_idx_h, float16, H2)
DO_MUL_IDX(gvec_fmul_idx_s, float32, H4)
DO_MUL_IDX(gvec_fmul_idx_d, float64, )

#undef DO_MUL_IDX
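
/* As DO_MUL_IDX, but a fused multiply-add with a separate addend vector.
 * Bit 0 of the data field is the negate-op1 flag: it is shifted up to
 * the sign bit and xored into n[], so the same expansion serves both
 * the fmla and fmls by-element forms; the remaining data bits hold the
 * index.
 */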

#define DO_FMLA_IDX(NAME, TYPE, H)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
                  void *stat, uint32_t desc)                               \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
                                     mm, a[i + j], 0, stat);               \
        }                                                                  \
    }                                                                      \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, )

#undef DO_FMLA_IDX