/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"


/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that needs a host-endian fixup.  */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)  ((x) ^ 7)
#define H2(x)  ((x) ^ 3)
#define H4(x)  ((x) ^ 1)
#else
#define H1(x)  (x)
#define H2(x)  (x)
#define H4(x)  (x)
#endif

#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q

static void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uint64_t *d = vd + opr_sz;
    uintptr_t i;

    for (i = opr_sz; i < max_sz; i += 8) {
        *d++ = 0;
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
static uint16_t inl_qrdmlah_s16(CPUARMState *env, int16_t src1,
                                int16_t src2, int16_t src3)
{
    /* Simplify:
     * = ((a3 << 16) + ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) + (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) + ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        SET_QC();
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}
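
/*
 * Worked example in Q15 fixed point: with src1 = src2 = 0x4000 (0.5) and
 * src3 = 0, the doubled product plus rounding gives
 *   (0 + 0x10000000 + (1 << 14)) >> 15 = 0x2000 (0.25).
 * With src1 = src2 = INT16_MIN (-1.0) the shifted result is 0x8000, which
 * does not fit in int16_t, so the result saturates to 0x7fff and QC is set.
 */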

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint16_t e1 = inl_qrdmlah_s16(env, src1, src2, src3);
    uint16_t e2 = inl_qrdmlah_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlah_s16(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 16-bit */
static uint16_t inl_qrdmlsh_s16(CPUARMState *env, int16_t src1,
                                int16_t src2, int16_t src3)
{
    /* Similarly, using subtraction:
     * = ((a3 << 16) - ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) - (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) - ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        SET_QC();
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint16_t e1 = inl_qrdmlsh_s16(env, src1, src2, src3);
    uint16_t e2 = inl_qrdmlsh_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlsh_s16(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    /* Simplify similarly to inl_qrdmlah_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) + ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        SET_QC();
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_neon_qrdmlah_s32(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 32-bit */
uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    /* Simplify similarly to inl_qrdmlsh_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) - ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        SET_QC();
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_neon_qrdmlsh_s32(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
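
/*
 * Floating-point complex add.  Elements are stored as (real, imaginary)
 * pairs; the single data bit in desc selects whether the real or the
 * imaginary part of the second operand is negated, which is how the two
 * FCADD rotations (#90 and #270) differ.  For one pair this computes either
 *   d.real = n.real - m.imag;  d.imag = n.imag + m.real
 * or
 *   d.real = n.real + m.imag;  d.imag = n.imag - m.real
 */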

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
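
/*
 * Floating-point complex multiply-accumulate.  The two data bits in desc
 * encode the FCMLA rotation: "flip" selects whether the real or imaginary
 * element of n supplies both products, and "neg_imag" (together with
 * neg_real = flip ^ neg_imag) selects which of the two m elements is
 * negated.  Spelling out the four (flip, neg_imag) combinations for one
 * (real, imaginary) pair:
 *   (0, 0): d.real += n.real * m.real;  d.imag += n.real * m.imag
 *   (1, 0): d.real -= n.imag * m.imag;  d.imag += n.imag * m.real
 *   (0, 1): d.real -= n.real * m.real;  d.imag -= n.real * m.imag
 *   (1, 1): d.real += n.imag * m.imag;  d.imag -= n.imag * m.real
 * covering the four FCMLA rotations (#0, #90, #180, #270).
 */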

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;
    float16 e1 = m[H2(2 * index + flip)];
    float16 e3 = m[H2(2 * index + 1 - flip)];

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;
    e1 ^= neg_real;
    e3 ^= neg_imag;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e4 = e2;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;
    float32 e1 = m[H4(2 * index + flip)];
    float32 e3 = m[H4(2 * index + 1 - flip)];

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;
    e1 ^= neg_real;
    e3 ^= neg_imag;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e4 = e2;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, d[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, d[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
}
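
/*
 * For example, DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) below
 * expands to a helper that applies helper_recpe_f16 to each float16
 * element:  d[i] = helper_recpe_f16(n[i], stat).
 */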

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

#undef DO_2OP

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

#ifdef TARGET_AARCH64

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */
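
/*
 * For example, with gvec_fmul_idx_s on a 32-byte SVE vector and idx == 1,
 * segment == 4, so elements 0..3 are each multiplied by element 1 of m,
 * while elements 4..7 are each multiplied by element 5 of m.
 */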

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_mul(n[i + j], mm, stat);                     \
        }                                                                  \
    }                                                                      \
}

DO_MUL_IDX(gvec_fmul_idx_h, float16, H2)
DO_MUL_IDX(gvec_fmul_idx_s, float32, H4)
DO_MUL_IDX(gvec_fmul_idx_d, float64, )

#undef DO_MUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
                  void *stat, uint32_t desc)                               \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
                                     mm, a[i + j], 0, stat);               \
        }                                                                  \
    }                                                                      \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, )

#undef DO_FMLA_IDX