/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"


/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup.  */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)  ((x) ^ 7)
#define H2(x)  ((x) ^ 3)
#define H4(x)  ((x) ^ 1)
#else
#define H1(x)  (x)
#define H2(x)  (x)
#define H4(x)  (x)
#endif

static void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uint64_t *d = vd + opr_sz;
    uintptr_t i;

    for (i = opr_sz; i < max_sz; i += 8) {
        *d++ = 0;
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
static int16_t inl_qrdmlah_s16(int16_t src1, int16_t src2,
                               int16_t src3, uint32_t *sat)
{
    /* Simplify:
     * = ((a3 << 16) + ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) + (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) + ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}
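
/*
 * Worked example of the rounding and saturation above (illustrative only):
 * with e1 = e2 = 0x8000 (-32768) and a3 = 0, the doubled product is
 * 0x80000000; adding the rounding constant and taking the high half
 * gives +32768, which does not fit in int16_t, so the result saturates
 * to 0x7fff and the QC bit is set.
 */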

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = inl_qrdmlah_s16(src1, src2, src3, sat);
    uint16_t e2 = inl_qrdmlah_s16(src1 >> 16, src2 >> 16, src3 >> 16, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlah_s16(n[i], m[i], d[i], vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 16-bit */
static int16_t inl_qrdmlsh_s16(int16_t src1, int16_t src2,
                               int16_t src3, uint32_t *sat)
{
    /* Similarly, using subtraction:
     * = ((a3 << 16) - ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) - (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) - ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = inl_qrdmlsh_s16(src1, src2, src3, sat);
    uint16_t e2 = inl_qrdmlsh_s16(src1 >> 16, src2 >> 16, src3 >> 16, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlsh_s16(n[i], m[i], d[i], vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
static int32_t inl_qrdmlah_s32(int32_t src1, int32_t src2,
                               int32_t src3, uint32_t *sat)
{
    /* Simplify similarly to inl_qrdmlah_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) + ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return inl_qrdmlah_s32(src1, src2, src3, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = inl_qrdmlah_s32(n[i], m[i], d[i], vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 32-bit */
static int32_t inl_qrdmlsh_s32(int32_t src1, int32_t src2,
                               int32_t src3, uint32_t *sat)
{
    /* Simplify similarly to inl_qrdmlsh_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) - ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return inl_qrdmlsh_s32(src1, src2, src3, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = inl_qrdmlsh_s32(n[i], m[i], d[i], vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the 64-bit lanes.
 * All elements are treated equally, no matter where they are.
 */

void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd;
    int8_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] += n[i * 4 + 0] * m[i * 4 + 0]
              + n[i * 4 + 1] * m[i * 4 + 1]
              + n[i * 4 + 2] * m[i * 4 + 2]
              + n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd;
    uint8_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] += n[i * 4 + 0] * m[i * 4 + 0]
              + n[i * 4 + 1] * m[i * 4 + 1]
              + n[i * 4 + 2] * m[i * 4 + 2]
              + n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd;
    int16_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] += (int64_t)n[i * 4 + 0] * m[i * 4 + 0]
              + (int64_t)n[i * 4 + 1] * m[i * 4 + 1]
              + (int64_t)n[i * 4 + 2] * m[i * 4 + 2]
              + (int64_t)n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd;
    uint16_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] += (uint64_t)n[i * 4 + 0] * m[i * 4 + 0]
              + (uint64_t)n[i * 4 + 1] * m[i * 4 + 1]
              + (uint64_t)n[i * 4 + 2] * m[i * 4 + 2]
              + (uint64_t)n[i * 4 + 3] * m[i * 4 + 3];
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
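
/*
 * For the indexed byte dot-products below, the four bytes of m selected
 * by the index are loaded once per segment (16 bytes, or 8 for the
 * AdvSIMD case) and reused for every 32-bit lane within that segment.
 */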

void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
    intptr_t index = simd_data(desc);
    uint32_t *d = vd;
    int8_t *n = vn;
    int8_t *m_indexed = (int8_t *)vm + index * 4;

    /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
     * Otherwise opr_sz is a multiple of 16.
     */
    segend = MIN(4, opr_sz_4);
    i = 0;
    do {
        int8_t m0 = m_indexed[i * 4 + 0];
        int8_t m1 = m_indexed[i * 4 + 1];
        int8_t m2 = m_indexed[i * 4 + 2];
        int8_t m3 = m_indexed[i * 4 + 3];

        do {
            d[i] += n[i * 4 + 0] * m0
                  + n[i * 4 + 1] * m1
                  + n[i * 4 + 2] * m2
                  + n[i * 4 + 3] * m3;
        } while (++i < segend);
        segend = i + 4;
    } while (i < opr_sz_4);

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
    intptr_t index = simd_data(desc);
    uint32_t *d = vd;
    uint8_t *n = vn;
    uint8_t *m_indexed = (uint8_t *)vm + index * 4;

    /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
     * Otherwise opr_sz is a multiple of 16.
     */
    segend = MIN(4, opr_sz_4);
    i = 0;
    do {
        uint8_t m0 = m_indexed[i * 4 + 0];
        uint8_t m1 = m_indexed[i * 4 + 1];
        uint8_t m2 = m_indexed[i * 4 + 2];
        uint8_t m3 = m_indexed[i * 4 + 3];

        do {
            d[i] += n[i * 4 + 0] * m0
                  + n[i * 4 + 1] * m1
                  + n[i * 4 + 2] * m2
                  + n[i * 4 + 3] * m3;
        } while (++i < segend);
        segend = i + 4;
    } while (i < opr_sz_4);

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
    intptr_t index = simd_data(desc);
    uint64_t *d = vd;
    int16_t *n = vn;
    int16_t *m_indexed = (int16_t *)vm + index * 4;

    /* This is supported by SVE only, so opr_sz is always a multiple of 16.
     * Process the entire segment all at once, writing back the results
     * only after we've consumed all of the inputs.
     */
    for (i = 0; i < opr_sz_8; i += 2) {
        uint64_t d0, d1;

        d0  = n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0];
        d0 += n[i * 4 + 1] * (int64_t)m_indexed[i * 4 + 1];
        d0 += n[i * 4 + 2] * (int64_t)m_indexed[i * 4 + 2];
        d0 += n[i * 4 + 3] * (int64_t)m_indexed[i * 4 + 3];
        d1  = n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0];
        d1 += n[i * 4 + 5] * (int64_t)m_indexed[i * 4 + 1];
        d1 += n[i * 4 + 6] * (int64_t)m_indexed[i * 4 + 2];
        d1 += n[i * 4 + 7] * (int64_t)m_indexed[i * 4 + 3];

        d[i + 0] += d0;
        d[i + 1] += d1;
    }

    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
    intptr_t index = simd_data(desc);
    uint64_t *d = vd;
    uint16_t *n = vn;
    uint16_t *m_indexed = (uint16_t *)vm + index * 4;

    /* This is supported by SVE only, so opr_sz is always a multiple of 16.
     * Process the entire segment all at once, writing back the results
     * only after we've consumed all of the inputs.
     */
    for (i = 0; i < opr_sz_8; i += 2) {
        uint64_t d0, d1;

        d0  = n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
        d0 += n[i * 4 + 1] * (uint64_t)m_indexed[i * 4 + 1];
        d0 += n[i * 4 + 2] * (uint64_t)m_indexed[i * 4 + 2];
        d0 += n[i * 4 + 3] * (uint64_t)m_indexed[i * 4 + 3];
        d1  = n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
        d1 += n[i * 4 + 5] * (uint64_t)m_indexed[i * 4 + 1];
        d1 += n[i * 4 + 6] * (uint64_t)m_indexed[i * 4 + 2];
        d1 += n[i * 4 + 7] * (uint64_t)m_indexed[i * 4 + 3];

        d[i + 0] += d0;
        d[i + 1] += d1;
    }

    clear_tail(d, opr_sz, simd_maxsz(desc));
}
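
/*
 * For the complex addition helpers below, elements are stored as
 * interleaved [real, imag] pairs.  The single bit of immediate data
 * selects which operand of each pair from m is negated before the
 * addition, which is how the two FCADD rotations (#90 and #270) are
 * implemented.
 */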

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
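
/*
 * For the complex multiply-accumulate helpers below, the two bits of
 * immediate data encode the FCMLA rotation: 'flip' selects whether the
 * real or imaginary element of each pair feeds the partial products,
 * and the negation bits apply the sign changes that the four rotations
 * (#0, #90, #180, #270) require.  Each invocation accumulates two of
 * the four partial products of a full complex multiply.
 */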

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = 16 / sizeof(float16);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, d[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, d[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = 16 / sizeof(float32);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, d[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, d[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, d[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, d[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

#undef DO_2OP

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

#ifdef TARGET_AARCH64

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */
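
/*
 * For example, with float32 elements and idx == 1, lanes 0-3 of the
 * destination all use m[1], lanes 4-7 use m[5], and so on for each
 * further 128-bit segment.
 */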

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_mul(n[i + j], mm, stat);                     \
        }                                                                  \
    }                                                                      \
}

DO_MUL_IDX(gvec_fmul_idx_h, float16, H2)
DO_MUL_IDX(gvec_fmul_idx_s, float32, H4)
DO_MUL_IDX(gvec_fmul_idx_d, float64, )

#undef DO_MUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
                  void *stat, uint32_t desc)                               \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);  \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
                                     mm, a[i + j], 0, stat);               \
        }                                                                  \
    }                                                                      \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, )

#undef DO_FMLA_IDX

#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
    bool q = false;                                                        \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
        WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
        if (dd < MIN) {                                                    \
            dd = MIN;                                                      \
            q = true;                                                      \
        } else if (dd > MAX) {                                             \
            dd = MAX;                                                      \
            q = true;                                                      \
        }                                                                  \
        d[i] = dd;                                                         \
    }                                                                      \
    if (q) {                                                               \
        uint32_t *qc = vq;                                                 \
        qc[0] = 1;                                                         \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)

DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)

DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)

#undef DO_SAT
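
/*
 * The 64-bit saturating helpers below have no wider type to compute in,
 * so they detect overflow directly: an unsigned add overflows iff the
 * result is smaller than an operand, and a signed add overflows iff the
 * operands have the same sign and the result's sign differs from it.
 * The subtractions are checked analogously.
 */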

void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (dd < nn) {
            dd = UINT64_MAX;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (nn < mm) {
            dd = 0;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}


#define DO_SRA(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)          \
{                                                             \
    intptr_t i, oprsz = simd_oprsz(desc);                     \
    int shift = simd_data(desc);                              \
    TYPE *d = vd, *n = vn;                                    \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {              \
        d[i] += n[i] >> shift;                                \
    }                                                         \
    clear_tail(d, oprsz, simd_maxsz(desc));                   \
}

DO_SRA(gvec_ssra_b, int8_t)
DO_SRA(gvec_ssra_h, int16_t)
DO_SRA(gvec_ssra_s, int32_t)
DO_SRA(gvec_ssra_d, int64_t)

DO_SRA(gvec_usra_b, uint8_t)
DO_SRA(gvec_usra_h, uint16_t)
DO_SRA(gvec_usra_s, uint32_t)
DO_SRA(gvec_usra_d, uint64_t)

#undef DO_SRA
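
/*
 * The rounding shifts below compute (n + (1 << (shift - 1))) >> shift
 * without risking overflow in the intermediate sum: shift right by
 * shift - 1 first, then add the final bit that would be shifted out
 * back onto the truncated result.
 */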

#define DO_RSHR(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)          \
{                                                             \
    intptr_t i, oprsz = simd_oprsz(desc);                     \
    int shift = simd_data(desc);                              \
    TYPE *d = vd, *n = vn;                                    \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {              \
        TYPE tmp = n[i] >> (shift - 1);                       \
        d[i] = (tmp >> 1) + (tmp & 1);                        \
    }                                                         \
    clear_tail(d, oprsz, simd_maxsz(desc));                   \
}

DO_RSHR(gvec_srshr_b, int8_t)
DO_RSHR(gvec_srshr_h, int16_t)
DO_RSHR(gvec_srshr_s, int32_t)
DO_RSHR(gvec_srshr_d, int64_t)

DO_RSHR(gvec_urshr_b, uint8_t)
DO_RSHR(gvec_urshr_h, uint16_t)
DO_RSHR(gvec_urshr_s, uint32_t)
DO_RSHR(gvec_urshr_d, uint64_t)

#undef DO_RSHR

#define DO_RSRA(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)          \
{                                                             \
    intptr_t i, oprsz = simd_oprsz(desc);                     \
    int shift = simd_data(desc);                              \
    TYPE *d = vd, *n = vn;                                    \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {              \
        TYPE tmp = n[i] >> (shift - 1);                       \
        d[i] += (tmp >> 1) + (tmp & 1);                       \
    }                                                         \
    clear_tail(d, oprsz, simd_maxsz(desc));                   \
}

DO_RSRA(gvec_srsra_b, int8_t)
DO_RSRA(gvec_srsra_h, int16_t)
DO_RSRA(gvec_srsra_s, int32_t)
DO_RSRA(gvec_srsra_d, int64_t)

DO_RSRA(gvec_ursra_b, uint8_t)
DO_RSRA(gvec_ursra_h, uint16_t)
DO_RSRA(gvec_ursra_s, uint32_t)
DO_RSRA(gvec_ursra_d, uint64_t)

#undef DO_RSRA

#define DO_SRI(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                        \
{                                                                           \
    intptr_t i, oprsz = simd_oprsz(desc);                                   \
    int shift = simd_data(desc);                                            \
    TYPE *d = vd, *n = vn;                                                  \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                            \
        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
    }                                                                       \
    clear_tail(d, oprsz, simd_maxsz(desc));                                 \
}

DO_SRI(gvec_sri_b, uint8_t)
DO_SRI(gvec_sri_h, uint16_t)
DO_SRI(gvec_sri_s, uint32_t)
DO_SRI(gvec_sri_d, uint64_t)

#undef DO_SRI

#define DO_SLI(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                        \
{                                                                           \
    intptr_t i, oprsz = simd_oprsz(desc);                                   \
    int shift = simd_data(desc);                                            \
    TYPE *d = vd, *n = vn;                                                  \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                            \
        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]);      \
    }                                                                       \
    clear_tail(d, oprsz, simd_maxsz(desc));                                 \
}

DO_SLI(gvec_sli_b, uint8_t)
DO_SLI(gvec_sli_h, uint16_t)
DO_SLI(gvec_sli_s, uint32_t)
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI

/*
 * Convert float16 to float32, raising no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal.  */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32.  Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias.  */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}

static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */
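
/*
 * FMLAL widens float16 inputs to float32 and accumulates into float32
 * elements.  The FMLSL variants are handled by flipping the sign bit of
 * the float16 inputs before conversion, which is safe because the
 * bit-level conversion above preserves the sign and payload exactly.
 */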

static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues.  */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once.  */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
                         uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    uint64_t n_4;
    float32 m_1;

    /* Pre-load all of the f16 data, avoiding overlap issues.  */
    n_4 = load4_f16(vn, is_q, is_2);

    /* Negate all inputs for FMLSL at once.  */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}
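
/*
 * For the variable shifts below, the shift count is the signed low byte
 * of each m element: non-negative counts shift left, negative counts
 * shift right by the magnitude.  Left shifts of the element width or
 * more produce zero; right shifts of the width or more produce zero for
 * the unsigned helpers and a sign fill for the signed ones.
 */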

void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        int8_t nn = n[i];
        int8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -8 ? -mm : 7);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        int16_t nn = n[i];
        int16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -16 ? -mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t nn = n[i];
        uint64_t mm = m[i];
        uint64_t rr = 0;

        for (j = 0; j < 8; ++j) {
            uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
            rr ^= mm & mask;
            mm = (mm << 1) & 0xfefefefefefefefeull;
            nn >>= 1;
        }
        d[i] = rr;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
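
/*
 * For example, 0x03 * 0x03 as polynomials over GF(2) is
 * (x + 1) * (x + 1) = x^2 + 1, i.e. 0x05 rather than the integer
 * product 9, because the overlapping middle bits of the partial
 * products cancel under XOR instead of producing a carry.
 */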

/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
 */
void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        uint64_t nn = n[i + hi];
        uint64_t mm = m[i + hi];
        uint64_t rhi = 0;
        uint64_t rlo = 0;

        /* Bit 0 can only influence the low 64-bit result.  */
        if (nn & 1) {
            rlo = mm;
        }

        for (j = 1; j < 64; ++j) {
            uint64_t mask = -((nn >> j) & 1);
            rlo ^= (mm << j) & mask;
            rhi ^= (mm >> (64 - j)) & mask;
        }
        d[i] = rlo;
        d[i + 1] = rhi;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->16 polynomial multiply.
 *
 * The byte inputs are expanded to (or extracted from) half-words.
 * Note that neon and sve2 get the inputs from different positions.
 * This allows 4 bytes to be processed in parallel with uint64_t.
 */

static uint64_t expand_byte_to_half(uint64_t x)
{
    return  (x & 0x000000ff)
         | ((x & 0x0000ff00) << 8)
         | ((x & 0x00ff0000) << 16)
         | ((x & 0xff000000) << 24);
}

static uint64_t pmull_h(uint64_t op1, uint64_t op2)
{
    uint64_t result = 0;
    int i;

    for (i = 0; i < 8; ++i) {
        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
        result ^= op2 & mask;
        op1 >>= 1;
        op2 <<= 1;
    }
    return result;
}

void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t nn = n[hi], mm = m[hi];

    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
    nn >>= 32;
    mm >>= 32;
    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));

    clear_tail(d, 16, simd_maxsz(desc));
}

#ifdef TARGET_AARCH64
void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int shift = simd_data(desc) * 8;
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;

        d[i] = pmull_h(nn, mm);
    }
}
#endif

#define DO_CMP0(NAME, TYPE, OP)                         \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
        TYPE nn = *(TYPE *)(vn + i);                    \
        *(TYPE *)(vd + i) = -(nn OP 0);                 \
    }                                                   \
    clear_tail(vd, opr_sz, simd_maxsz(desc));           \
}

DO_CMP0(gvec_ceq0_b, int8_t, ==)
DO_CMP0(gvec_clt0_b, int8_t, <)
DO_CMP0(gvec_cle0_b, int8_t, <=)
DO_CMP0(gvec_cgt0_b, int8_t, >)
DO_CMP0(gvec_cge0_b, int8_t, >=)

DO_CMP0(gvec_ceq0_h, int16_t, ==)
DO_CMP0(gvec_clt0_h, int16_t, <)
DO_CMP0(gvec_cle0_h, int16_t, <=)
DO_CMP0(gvec_cgt0_h, int16_t, >)
DO_CMP0(gvec_cge0_h, int16_t, >=)

#undef DO_CMP0