/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"


/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup.  */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x) ((x) ^ 7)
#define H2(x) ((x) ^ 3)
#define H4(x) ((x) ^ 1)
#else
#define H1(x) (x)
#define H2(x) (x)
#define H4(x) (x)
#endif

static void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uint64_t *d = vd + opr_sz;
    uintptr_t i;

    for (i = opr_sz; i < max_sz; i += 8) {
        *d++ = 0;
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
static int16_t inl_qrdmlah_s16(int16_t src1, int16_t src2,
                               int16_t src3, uint32_t *sat)
{
    /* Simplify:
     * = ((a3 << 16) + ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) + (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) + ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = inl_qrdmlah_s16(src1, src2, src3, sat);
    uint16_t e2 = inl_qrdmlah_s16(src1 >> 16, src2 >> 16, src3 >> 16, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlah_s16(n[i], m[i], d[i], vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 16-bit */
static int16_t inl_qrdmlsh_s16(int16_t src1, int16_t src2,
                               int16_t src3, uint32_t *sat)
{
    /* Similarly, using subtraction:
     * = ((a3 << 16) - ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) - (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) - ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}
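
/*
 * Worked example (illustrative only): inl_qrdmlah_s16(INT16_MIN,
 * INT16_MIN, 0, sat) computes ((-32768 * -32768) + (1 << 14)) >> 15,
 * which is 0x8000 and does not fit in int16_t, so the result saturates
 * to 0x7fff and the QC flag is set via *sat.  The subtracting and
 * 32-bit variants saturate in the same way.
 */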

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = inl_qrdmlsh_s16(src1, src2, src3, sat);
    uint16_t e2 = inl_qrdmlsh_s16(src1 >> 16, src2 >> 16, src3 >> 16, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlsh_s16(n[i], m[i], d[i], vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
static int32_t inl_qrdmlah_s32(int32_t src1, int32_t src2,
                               int32_t src3, uint32_t *sat)
{
    /* Simplify similarly to inl_qrdmlah_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) + ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return inl_qrdmlah_s32(src1, src2, src3, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = inl_qrdmlah_s32(n[i], m[i], d[i], vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 32-bit */
static int32_t inl_qrdmlsh_s32(int32_t src1, int32_t src2,
                               int32_t src3, uint32_t *sat)
{
    /* Simplify similarly to inl_qrdmlsh_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) - ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return inl_qrdmlsh_s32(src1, src2, src3, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = inl_qrdmlsh_s32(n[i], m[i], d[i], vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the 64-bit lanes.
 * All elements are treated equally, no matter where they are.
 */
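
/*
 * Illustrative example (not part of the original comment): for the
 * byte forms, each 32-bit element of d accumulates the dot product of
 * four adjacent byte pairs, d[i] += sum(n[4i+k] * m[4i+k], k = 0..3).
 * So with opr_sz == 16 and every byte of n and m equal to 1, each of
 * the four accumulators simply gains 4.
 */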

void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd;
    int8_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] += n[i * 4 + 0] * m[i * 4 + 0]
              + n[i * 4 + 1] * m[i * 4 + 1]
              + n[i * 4 + 2] * m[i * 4 + 2]
              + n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd;
    uint8_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] += n[i * 4 + 0] * m[i * 4 + 0]
              + n[i * 4 + 1] * m[i * 4 + 1]
              + n[i * 4 + 2] * m[i * 4 + 2]
              + n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd;
    int16_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] += (int64_t)n[i * 4 + 0] * m[i * 4 + 0]
              + (int64_t)n[i * 4 + 1] * m[i * 4 + 1]
              + (int64_t)n[i * 4 + 2] * m[i * 4 + 2]
              + (int64_t)n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd;
    uint16_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] += (uint64_t)n[i * 4 + 0] * m[i * 4 + 0]
              + (uint64_t)n[i * 4 + 1] * m[i * 4 + 1]
              + (uint64_t)n[i * 4 + 2] * m[i * 4 + 2]
              + (uint64_t)n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
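
/*
 * For the indexed dot products below, simd_data(desc) holds the element
 * index: within each 128-bit segment of m, the group of four bytes (or
 * four halfwords) selected by the index is reused for every accumulator
 * in that segment.  For example (illustrative only), with index == 2 the
 * byte form uses bytes 8..11 of each segment of m.
 */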

void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
    intptr_t index = simd_data(desc);
    uint32_t *d = vd;
    int8_t *n = vn;
    int8_t *m_indexed = (int8_t *)vm + index * 4;

    /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
     * Otherwise opr_sz is a multiple of 16.
     */
    segend = MIN(4, opr_sz_4);
    i = 0;
    do {
        int8_t m0 = m_indexed[i * 4 + 0];
        int8_t m1 = m_indexed[i * 4 + 1];
        int8_t m2 = m_indexed[i * 4 + 2];
        int8_t m3 = m_indexed[i * 4 + 3];

        do {
            d[i] += n[i * 4 + 0] * m0
                  + n[i * 4 + 1] * m1
                  + n[i * 4 + 2] * m2
                  + n[i * 4 + 3] * m3;
        } while (++i < segend);
        segend = i + 4;
    } while (i < opr_sz_4);

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
    intptr_t index = simd_data(desc);
    uint32_t *d = vd;
    uint8_t *n = vn;
    uint8_t *m_indexed = (uint8_t *)vm + index * 4;

    /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
     * Otherwise opr_sz is a multiple of 16.
     */
    segend = MIN(4, opr_sz_4);
    i = 0;
    do {
        uint8_t m0 = m_indexed[i * 4 + 0];
        uint8_t m1 = m_indexed[i * 4 + 1];
        uint8_t m2 = m_indexed[i * 4 + 2];
        uint8_t m3 = m_indexed[i * 4 + 3];

        do {
            d[i] += n[i * 4 + 0] * m0
                  + n[i * 4 + 1] * m1
                  + n[i * 4 + 2] * m2
                  + n[i * 4 + 3] * m3;
        } while (++i < segend);
        segend = i + 4;
    } while (i < opr_sz_4);

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
    intptr_t index = simd_data(desc);
    uint64_t *d = vd;
    int16_t *n = vn;
    int16_t *m_indexed = (int16_t *)vm + index * 4;

    /* This is supported by SVE only, so opr_sz is always a multiple of 16.
     * Process the entire segment all at once, writing back the results
     * only after we've consumed all of the inputs.
     */
    for (i = 0; i < opr_sz_8; i += 2) {
        uint64_t d0, d1;

        d0  = n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0];
        d0 += n[i * 4 + 1] * (int64_t)m_indexed[i * 4 + 1];
        d0 += n[i * 4 + 2] * (int64_t)m_indexed[i * 4 + 2];
        d0 += n[i * 4 + 3] * (int64_t)m_indexed[i * 4 + 3];
        d1  = n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0];
        d1 += n[i * 4 + 5] * (int64_t)m_indexed[i * 4 + 1];
        d1 += n[i * 4 + 6] * (int64_t)m_indexed[i * 4 + 2];
        d1 += n[i * 4 + 7] * (int64_t)m_indexed[i * 4 + 3];

        d[i + 0] += d0;
        d[i + 1] += d1;
    }

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
    intptr_t index = simd_data(desc);
    uint64_t *d = vd;
    uint16_t *n = vn;
    uint16_t *m_indexed = (uint16_t *)vm + index * 4;

    /* This is supported by SVE only, so opr_sz is always a multiple of 16.
     * Process the entire segment all at once, writing back the results
     * only after we've consumed all of the inputs.
     */
    for (i = 0; i < opr_sz_8; i += 2) {
        uint64_t d0, d1;

        d0  = n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
        d0 += n[i * 4 + 1] * (uint64_t)m_indexed[i * 4 + 1];
        d0 += n[i * 4 + 2] * (uint64_t)m_indexed[i * 4 + 2];
        d0 += n[i * 4 + 3] * (uint64_t)m_indexed[i * 4 + 3];
        d1  = n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
        d1 += n[i * 4 + 5] * (uint64_t)m_indexed[i * 4 + 1];
        d1 += n[i * 4 + 6] * (uint64_t)m_indexed[i * 4 + 2];
        d1 += n[i * 4 + 7] * (uint64_t)m_indexed[i * 4 + 3];

        d[i + 0] += d0;
        d[i + 1] += d1;
    }

    clear_tail(d, opr_sz, simd_maxsz(desc));
}
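
/*
 * FCADD: the single desc bit selects the rotation.  In the helpers
 * below, a clear bit sets neg_imag, giving
 *   d.real = n.real - m.imag;  d.imag = n.imag + m.real
 * (the #90 rotation), while a set bit sets neg_real, giving
 *   d.real = n.real + m.imag;  d.imag = n.imag - m.real
 * (the #270 rotation).
 */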

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
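
/*
 * FCMLA: flip (bit 0) selects the real or imaginary element of n, and
 * neg_imag (bit 1) together with neg_real = flip ^ neg_imag supplies
 * the negations.  Assuming the translator passes the architectural
 * rotation in these two bits, the helpers below compute, for
 * n = (nr, ni) and m = (mr, mi):
 *   rot 0:   d.real += nr * mr;   d.imag += nr * mi
 *   rot 90:  d.real += -ni * mi;  d.imag += ni * mr
 *   rot 180: d.real += -nr * mr;  d.imag += -nr * mi
 *   rot 270: d.real += ni * mi;   d.imag += -ni * mr
 */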

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = 16 / sizeof(float16);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, d[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, d[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
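
/*
 * For the indexed FCMLA forms (gvec_fcmlah_idx above and
 * gvec_fcmlas_idx below), 2 * index selects one (real, imag) pair
 * within each 128-bit segment of m; that single pair is then used for
 * every complex element of n and d in the same segment.
 */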

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = 16 / sizeof(float32);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, d[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, d[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, d[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, d[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

#undef DO_2OP

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

#ifdef TARGET_AARCH64

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */
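
/*
 * As a concrete illustration (not from the original comments): for
 * gvec_fmul_idx_s with oprsz == 32 and idx == 1, each result is
 * d[j] = n[j] * m[(j & ~3) + 1], i.e. the second float32 of whichever
 * 128-bit segment j falls in (ignoring the host-endian H4 fixup).
 */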

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_mul(n[i + j], mm, stat);                     \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MUL_IDX(gvec_fmul_idx_h, float16, H2)
DO_MUL_IDX(gvec_fmul_idx_s, float32, H4)
DO_MUL_IDX(gvec_fmul_idx_d, float64, )

#undef DO_MUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
                  void *stat, uint32_t desc)                               \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
                                     mm, a[i + j], 0, stat);               \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, )

#undef DO_FMLA_IDX

#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
    bool q = false;                                                        \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
        WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
        if (dd < MIN) {                                                    \
            dd = MIN;                                                      \
            q = true;                                                      \
        } else if (dd > MAX) {                                             \
            dd = MAX;                                                      \
            q = true;                                                      \
        }                                                                  \
        d[i] = dd;                                                         \
    }                                                                      \
    if (q) {                                                               \
        uint32_t *qc = vq;                                                 \
        qc[0] = 1;                                                         \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)

DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)

DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)

#undef DO_SAT

void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (dd < nn) {
            dd = UINT64_MAX;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
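
/*
 * For the remaining 64-bit saturating helpers there is no wider type to
 * widen into, so wrap/overflow is detected directly: unsigned add wraps
 * iff dd < nn (above), unsigned subtract wraps iff nn < mm, and the
 * signed forms overflow iff the sign of the result differs from nn while
 * the operands' signs agree (for add) or differ (for subtract).  In the
 * signed case, (nn >> 63) ^ ~INT64_MIN yields INT64_MAX for non-negative
 * nn and INT64_MIN otherwise.
 */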

void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (nn < mm) {
            dd = 0;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}


#define DO_SRA(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] += n[i] >> shift;                          \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SRA(gvec_ssra_b, int8_t)
DO_SRA(gvec_ssra_h, int16_t)
DO_SRA(gvec_ssra_s, int32_t)
DO_SRA(gvec_ssra_d, int64_t)

DO_SRA(gvec_usra_b, uint8_t)
DO_SRA(gvec_usra_h, uint16_t)
DO_SRA(gvec_usra_s, uint32_t)
DO_SRA(gvec_usra_d, uint64_t)

#undef DO_SRA

#define DO_RSHR(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] = (tmp >> 1) + (tmp & 1);                  \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSHR(gvec_srshr_b, int8_t)
DO_RSHR(gvec_srshr_h, int16_t)
DO_RSHR(gvec_srshr_s, int32_t)
DO_RSHR(gvec_srshr_d, int64_t)

DO_RSHR(gvec_urshr_b, uint8_t)
DO_RSHR(gvec_urshr_h, uint16_t)
DO_RSHR(gvec_urshr_s, uint32_t)
DO_RSHR(gvec_urshr_d, uint64_t)

#undef DO_RSHR
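
/*
 * Both DO_RSHR above and DO_RSRA below use the same rounding trick:
 * with tmp = n >> (shift - 1), the value (tmp >> 1) + (tmp & 1) equals
 * (n + (1 << (shift - 1))) >> shift, computed without risking overflow
 * of the intermediate sum.  Illustrative check: n = 127, shift = 1
 * gives tmp = 127 and a result of 64.
 */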

#define DO_RSRA(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] += (tmp >> 1) + (tmp & 1);                 \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSRA(gvec_srsra_b, int8_t)
DO_RSRA(gvec_srsra_h, int16_t)
DO_RSRA(gvec_srsra_s, int32_t)
DO_RSRA(gvec_srsra_d, int64_t)

DO_RSRA(gvec_ursra_b, uint8_t)
DO_RSRA(gvec_ursra_h, uint16_t)
DO_RSRA(gvec_ursra_s, uint32_t)
DO_RSRA(gvec_ursra_d, uint64_t)

#undef DO_RSRA

#define DO_SRI(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SRI(gvec_sri_b, uint8_t)
DO_SRI(gvec_sri_h, uint16_t)
DO_SRI(gvec_sri_s, uint32_t)
DO_SRI(gvec_sri_d, uint64_t)

#undef DO_SRI

#define DO_SLI(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SLI(gvec_sli_b, uint8_t)
DO_SLI(gvec_sli_h, uint16_t)
DO_SLI(gvec_sli_s, uint32_t)
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI

/*
 * Convert float16 to float32, raising no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal.  */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32.  Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias.  */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}

static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */
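
/*
 * In outline: do_fmlal below widens the low or high half (selected by
 * is_2) of the float16 inputs to float32 and multiply-accumulates into
 * the float32 destination; the FMLSL forms negate n by flipping the
 * four f16 sign bits up front (is_s), which is why a single 64-bit XOR
 * suffices.
 */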

static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues.  */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once.  */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
                         uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    uint64_t n_4;
    float32 m_1;

    /* Pre-load all of the f16 data, avoiding overlap issues.  */
    n_4 = load4_f16(vn, is_q, is_2);

    /* Negate all inputs for FMLSL at once.  */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        int8_t nn = n[i];
        int8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -8 ? -mm : 7);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        int16_t nn = n[i];
        int16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -16 ? -mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
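
/*
 * In these SSHL/USHL helpers the per-element shift count is the signed
 * byte in m: non-negative counts shift left (with counts >= the element
 * width giving zero), negative counts shift right by -mm.  For the
 * signed forms above, an over-wide right shift still produces the sign
 * bit, hence the clamp to 7 or 15; for the unsigned forms below it
 * simply produces zero.
 */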

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t nn = n[i];
        uint64_t mm = m[i];
        uint64_t rr = 0;

        for (j = 0; j < 8; ++j) {
            uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
            rr ^= mm & mask;
            mm = (mm << 1) & 0xfefefefefefefefeull;
            nn >>= 1;
        }
        d[i] = rr;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
 */
void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        uint64_t nn = n[i + hi];
        uint64_t mm = m[i + hi];
        uint64_t rhi = 0;
        uint64_t rlo = 0;

        /* Bit 0 can only influence the low 64-bit result.  */
        if (nn & 1) {
            rlo = mm;
        }

        for (j = 1; j < 64; ++j) {
            uint64_t mask = -((nn >> j) & 1);
            rlo ^= (mm << j) & mask;
            rhi ^= (mm >> (64 - j)) & mask;
        }
        d[i] = rlo;
        d[i + 1] = rhi;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->16 polynomial multiply.
 *
 * The byte inputs are expanded to (or extracted from) half-words.
 * Note that neon and sve2 get the inputs from different positions.
 * This allows 4 bytes to be processed in parallel with uint64_t.
 */
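
/*
 * Worked example of the carry-less arithmetic used throughout these
 * helpers (illustrative only): 0x03 * 0x03 is (x + 1) * (x + 1) over
 * GF(2), i.e. 0x03 ^ 0x06 = 0x05, not the integer product 9.
 */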

static uint64_t expand_byte_to_half(uint64_t x)
{
    return  (x & 0x000000ff)
         | ((x & 0x0000ff00) << 8)
         | ((x & 0x00ff0000) << 16)
         | ((x & 0xff000000) << 24);
}

static uint64_t pmull_h(uint64_t op1, uint64_t op2)
{
    uint64_t result = 0;
    int i;

    for (i = 0; i < 8; ++i) {
        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
        result ^= op2 & mask;
        op1 >>= 1;
        op2 <<= 1;
    }
    return result;
}

void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t nn = n[hi], mm = m[hi];

    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
    nn >>= 32;
    mm >>= 32;
    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));

    clear_tail(d, 16, simd_maxsz(desc));
}

#ifdef TARGET_AARCH64
void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int shift = simd_data(desc) * 8;
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;

        d[i] = pmull_h(nn, mm);
    }
}
#endif

#define DO_CMP0(NAME, TYPE, OP)                         \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
        TYPE nn = *(TYPE *)(vn + i);                    \
        *(TYPE *)(vd + i) = -(nn OP 0);                 \
    }                                                   \
    clear_tail(vd, opr_sz, simd_maxsz(desc));           \
}

DO_CMP0(gvec_ceq0_b, int8_t, ==)
DO_CMP0(gvec_clt0_b, int8_t, <)
DO_CMP0(gvec_cle0_b, int8_t, <=)
DO_CMP0(gvec_cgt0_b, int8_t, >)
DO_CMP0(gvec_cge0_b, int8_t, >=)

DO_CMP0(gvec_ceq0_h, int16_t, ==)
DO_CMP0(gvec_clt0_h, int16_t, <)
DO_CMP0(gvec_cle0_h, int16_t, <=)
DO_CMP0(gvec_cgt0_h, int16_t, >)
DO_CMP0(gvec_cge0_h, int16_t, >=)

#undef DO_CMP0