/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "exec/exec-all.h"
#include "exec/page-protection.h"
#include "exec/helper-proto.h"
#include "exec/tlb-flags.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "tcg/tcg.h"
#include "vec_internal.h"
#include "sve_ldst_internal.h"
#include "accel/tcg/cpu-ops.h"
#ifdef CONFIG_USER_ONLY
#include "user/page-protection.h"
#endif


/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1

/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}

/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}

/* The same for a single word predicate.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

/* The same for a multi-word predicate.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
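/*
 * Illustrative sketch, not used by any helper below: feeding a single
 * predicate word through iter_predtest_fwd shows how the PredTest flags
 * are assembled.  The function name and the concrete d/g values are
 * hypothetical and exist only to document the encoding described above.
 */
static inline uint32_t example_predtest_single_word(void)
{
    /*
     * d = 0x01, g = 0x11: elements 0 and 4 are governed,
     * only element 0 is true.
     */
    uint32_t flags = iter_predtest_fwd(0x01, 0x11, PREDTEST_INIT);

    /*
     * First active bit (g & -g) == 0x01 is set in d    -> N (bit 31) set.
     * Some d & g bit is set                            -> Z clear (bit 1 set).
     * Last active bit pow2floor(g) == 0x10 clear in d  -> C (bit 0) set.
     * Hence flags == 0x80000007.
     */
    return flags;
}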
/* Similarly for single word elements.  */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    return word[byte & 0x11];
}

#define LOGICAL_PPPP(NAME, FUNC)                                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    uintptr_t opr_sz = simd_oprsz(desc);                                  \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
    uintptr_t i;                                                          \
    for (i = 0; i < opr_sz / 8; ++i) {                                    \
        d[i] = FUNC(n[i], m[i], g[i]);                                    \
    }                                                                     \
}

#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP

/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            if (pg & 1) {                                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                TYPE mm = *(TYPE *)(vm + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                            \
    TYPE *d = vd, *n = vn, *m = vm;                                       \
    uint8_t *pg = vg;                                                     \
    for (i = 0; i < opr_sz; i += 1) {                                     \
        if (pg[H1(i)] & 1) {                                              \
            TYPE nn = n[i], mm = m[i];                                    \
            d[i] = OP(nn, mm);                                            \
        }                                                                 \
    }                                                                     \
}

#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)

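/*
 * Illustrative sketch of how DO_ZPZZ consumes the governing predicate;
 * the helper below is hypothetical and not instantiated by the
 * translator.  It is essentially what DO_ZPZZ(example_add_zpzz_s,
 * uint32_t, H1_4, DO_ADD) would expand to, written out as a plain
 * static function.  Each 16-byte segment of the vector is covered by
 * one 16-bit predicate word; the predicate bit for an element sits at
 * that element's byte offset, so after each element we advance i by
 * sizeof(TYPE) and shift the predicate right by the same amount.
 */
static inline void example_add_zpzz_s(void *vd, void *vn, void *vm,
                                      void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    for (i = 0; i < opr_sz; ) {
        /* One predicate word governs the next 16 bytes of the vector.  */
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                uint32_t nn = *(uint32_t *)(vn + H1_4(i));
                uint32_t mm = *(uint32_t *)(vm + H1_4(i));
                *(uint32_t *)(vd + H1_4(i)) = DO_ADD(nn, mm);
            }
            /* Step one element; its predicate bit becomes the low bit.  */
            i += sizeof(uint32_t), pg >>= sizeof(uint32_t);
        } while (i & 15);
    }
}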
/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1.  Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)

DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)

/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types.
*/ 286 static inline uint8_t do_mulh_b(int32_t n, int32_t m) 287 { 288 return (n * m) >> 8; 289 } 290 291 static inline uint16_t do_mulh_h(int32_t n, int32_t m) 292 { 293 return (n * m) >> 16; 294 } 295 296 static inline uint32_t do_mulh_s(int64_t n, int64_t m) 297 { 298 return (n * m) >> 32; 299 } 300 301 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m) 302 { 303 uint64_t lo, hi; 304 muls64(&lo, &hi, n, m); 305 return hi; 306 } 307 308 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m) 309 { 310 uint64_t lo, hi; 311 mulu64(&lo, &hi, n, m); 312 return hi; 313 } 314 315 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL) 316 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL) 317 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL) 318 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL) 319 320 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b) 321 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h) 322 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s) 323 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d) 324 325 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b) 326 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h) 327 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s) 328 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d) 329 330 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV) 331 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV) 332 333 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV) 334 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV) 335 336 /* Note that all bits of the shift are significant 337 and not modulo the element size. */ 338 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1)) 339 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0) 340 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0) 341 342 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR) 343 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR) 344 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL) 345 346 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR) 347 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR) 348 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL) 349 350 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR) 351 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR) 352 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL) 353 354 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR) 355 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR) 356 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL) 357 358 static inline uint16_t do_sadalp_h(int16_t n, int16_t m) 359 { 360 int8_t n1 = n, n2 = n >> 8; 361 return m + n1 + n2; 362 } 363 364 static inline uint32_t do_sadalp_s(int32_t n, int32_t m) 365 { 366 int16_t n1 = n, n2 = n >> 16; 367 return m + n1 + n2; 368 } 369 370 static inline uint64_t do_sadalp_d(int64_t n, int64_t m) 371 { 372 int32_t n1 = n, n2 = n >> 32; 373 return m + n1 + n2; 374 } 375 376 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h) 377 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s) 378 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d) 379 380 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m) 381 { 382 uint8_t n1 = n, n2 = n >> 8; 383 return m + n1 + n2; 384 } 385 386 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m) 387 { 388 uint16_t n1 = n, n2 = n >> 16; 389 return m + n1 + n2; 390 } 391 392 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m) 393 { 394 uint32_t n1 = n, n2 = n >> 32; 395 return m + n1 + n2; 396 } 397 398 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h) 399 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s) 400 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d) 401 402 #define do_srshl_b(n, m) 
do_sqrshl_bhs(n, m, 8, true, NULL) 403 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL) 404 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL) 405 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL) 406 407 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b) 408 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h) 409 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s) 410 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d) 411 412 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL) 413 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL) 414 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL) 415 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL) 416 417 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b) 418 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h) 419 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s) 420 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d) 421 422 /* 423 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set. 424 * We pass in a pointer to a dummy saturation field to trigger 425 * the saturating arithmetic but discard the information about 426 * whether it has occurred. 427 */ 428 #define do_sqshl_b(n, m) \ 429 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); }) 430 #define do_sqshl_h(n, m) \ 431 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); }) 432 #define do_sqshl_s(n, m) \ 433 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); }) 434 #define do_sqshl_d(n, m) \ 435 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); }) 436 437 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b) 438 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h) 439 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s) 440 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d) 441 442 #define do_uqshl_b(n, m) \ 443 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 444 #define do_uqshl_h(n, m) \ 445 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 446 #define do_uqshl_s(n, m) \ 447 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); }) 448 #define do_uqshl_d(n, m) \ 449 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); }) 450 451 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b) 452 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h) 453 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s) 454 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d) 455 456 #define do_sqrshl_b(n, m) \ 457 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); }) 458 #define do_sqrshl_h(n, m) \ 459 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); }) 460 #define do_sqrshl_s(n, m) \ 461 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); }) 462 #define do_sqrshl_d(n, m) \ 463 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); }) 464 465 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b) 466 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h) 467 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s) 468 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d) 469 470 #undef do_sqrshl_d 471 472 #define do_uqrshl_b(n, m) \ 473 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); }) 474 #define do_uqrshl_h(n, m) \ 475 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); }) 476 #define do_uqrshl_s(n, m) \ 477 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); }) 478 #define do_uqrshl_d(n, m) \ 479 ({ uint32_t discard; do_uqrshl_d(n, 
m, true, &discard); }) 480 481 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b) 482 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h) 483 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s) 484 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d) 485 486 #undef do_uqrshl_d 487 488 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1) 489 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1)) 490 491 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS) 492 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS) 493 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS) 494 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D) 495 496 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS) 497 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS) 498 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS) 499 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D) 500 501 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1) 502 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1)) 503 504 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS) 505 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS) 506 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS) 507 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D) 508 509 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS) 510 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS) 511 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS) 512 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D) 513 514 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1) 515 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1)) 516 517 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS) 518 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS) 519 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS) 520 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D) 521 522 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS) 523 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) 524 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) 525 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) 526 527 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max) 528 { 529 return val >= max ? max : val <= min ? min : val; 530 } 531 532 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX) 533 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX) 534 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX) 535 536 static inline int64_t do_sqadd_d(int64_t n, int64_t m) 537 { 538 int64_t r = n + m; 539 if (((r ^ n) & ~(n ^ m)) < 0) { 540 /* Signed overflow. */ 541 return r < 0 ? INT64_MAX : INT64_MIN; 542 } 543 return r; 544 } 545 546 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B) 547 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) 548 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) 549 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) 550 551 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX) 552 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX) 553 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX) 554 555 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) 556 { 557 uint64_t r = n + m; 558 return r < n ? 
UINT64_MAX : r;
}

DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = n - m;
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}

DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)

static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    return n > m ? n - m : 0;
}

DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow.  */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative.  */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive.  */
                return INT64_MAX;
            }
            /* Result is negative.  */
        }
    } else {
        /* Both inputs are positive: check for overflow.  */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}

DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)

static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        return n < -m ? 0 : r;
    }
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

#undef DO_ZPZZ
#undef DO_ZPZZ_D

/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements are from VN {I, I+1}.
 * If the slot I is odd, the elements are from VM {I-1, I}.
663 * Load all of the input elements in each pair before overwriting output. 664 */ 665 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \ 666 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 667 { \ 668 intptr_t i, opr_sz = simd_oprsz(desc); \ 669 for (i = 0; i < opr_sz; ) { \ 670 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 671 do { \ 672 TYPE n0 = *(TYPE *)(vn + H(i)); \ 673 TYPE m0 = *(TYPE *)(vm + H(i)); \ 674 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 675 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 676 if (pg & 1) { \ 677 *(TYPE *)(vd + H(i)) = OP(n0, n1); \ 678 } \ 679 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 680 if (pg & 1) { \ 681 *(TYPE *)(vd + H(i)) = OP(m0, m1); \ 682 } \ 683 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 684 } while (i & 15); \ 685 } \ 686 } 687 688 /* Similarly, specialized for 64-bit operands. */ 689 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \ 690 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 691 { \ 692 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 693 TYPE *d = vd, *n = vn, *m = vm; \ 694 uint8_t *pg = vg; \ 695 for (i = 0; i < opr_sz; i += 2) { \ 696 TYPE n0 = n[i], n1 = n[i + 1]; \ 697 TYPE m0 = m[i], m1 = m[i + 1]; \ 698 if (pg[H1(i)] & 1) { \ 699 d[i] = OP(n0, n1); \ 700 } \ 701 if (pg[H1(i + 1)] & 1) { \ 702 d[i + 1] = OP(m0, m1); \ 703 } \ 704 } \ 705 } 706 707 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD) 708 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD) 709 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD) 710 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD) 711 712 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX) 713 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX) 714 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX) 715 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX) 716 717 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN) 718 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN) 719 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN) 720 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN) 721 722 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX) 723 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX) 724 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX) 725 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX) 726 727 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN) 728 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN) 729 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN) 730 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN) 731 732 #undef DO_ZPZZ_PAIR 733 #undef DO_ZPZZ_PAIR_D 734 735 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \ 736 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 737 float_status *status, uint32_t desc) \ 738 { \ 739 intptr_t i, opr_sz = simd_oprsz(desc); \ 740 for (i = 0; i < opr_sz; ) { \ 741 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 742 do { \ 743 TYPE n0 = *(TYPE *)(vn + H(i)); \ 744 TYPE m0 = *(TYPE *)(vm + H(i)); \ 745 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 746 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 747 if (pg & 1) { \ 748 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \ 749 } \ 750 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 751 if (pg & 1) { \ 752 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \ 753 } \ 754 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 755 } while (i & 15); \ 756 } \ 757 } 758 759 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add) 760 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add) 761 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, 
float64, H1_8, float64_add) 762 763 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum) 764 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum) 765 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum) 766 767 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum) 768 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum) 769 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum) 770 771 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max) 772 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max) 773 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max) 774 775 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min) 776 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min) 777 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min) 778 779 #undef DO_ZPZZ_PAIR_FP 780 781 /* Three-operand expander, controlled by a predicate, in which the 782 * third operand is "wide". That is, for D = N op M, the same 64-bit 783 * value of M is used with all of the narrower values of N. 784 */ 785 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \ 786 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 787 { \ 788 intptr_t i, opr_sz = simd_oprsz(desc); \ 789 for (i = 0; i < opr_sz; ) { \ 790 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \ 791 TYPEW mm = *(TYPEW *)(vm + i); \ 792 do { \ 793 if (pg & 1) { \ 794 TYPE nn = *(TYPE *)(vn + H(i)); \ 795 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 796 } \ 797 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 798 } while (i & 7); \ 799 } \ 800 } 801 802 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR) 803 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR) 804 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL) 805 806 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR) 807 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 808 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 809 810 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR) 811 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 812 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 813 814 #undef DO_ZPZW 815 816 /* Fully general two-operand expander, controlled by a predicate. 817 */ 818 #define DO_ZPZ(NAME, TYPE, H, OP) \ 819 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 820 { \ 821 intptr_t i, opr_sz = simd_oprsz(desc); \ 822 for (i = 0; i < opr_sz; ) { \ 823 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 824 do { \ 825 if (pg & 1) { \ 826 TYPE nn = *(TYPE *)(vn + H(i)); \ 827 *(TYPE *)(vd + H(i)) = OP(nn); \ 828 } \ 829 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 830 } while (i & 15); \ 831 } \ 832 } 833 834 /* Similarly, specialized for 64-bit operands. 
*/ 835 #define DO_ZPZ_D(NAME, TYPE, OP) \ 836 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 837 { \ 838 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 839 TYPE *d = vd, *n = vn; \ 840 uint8_t *pg = vg; \ 841 for (i = 0; i < opr_sz; i += 1) { \ 842 if (pg[H1(i)] & 1) { \ 843 TYPE nn = n[i]; \ 844 d[i] = OP(nn); \ 845 } \ 846 } \ 847 } 848 849 #define DO_CLS_B(N) (clrsb32(N) - 24) 850 #define DO_CLS_H(N) (clrsb32(N) - 16) 851 852 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B) 853 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H) 854 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32) 855 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64) 856 857 #define DO_CLZ_B(N) (clz32(N) - 24) 858 #define DO_CLZ_H(N) (clz32(N) - 16) 859 860 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B) 861 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H) 862 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32) 863 DO_ZPZ_D(sve_clz_d, uint64_t, clz64) 864 865 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8) 866 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16) 867 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32) 868 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64) 869 870 #define DO_CNOT(N) (N == 0) 871 872 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT) 873 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT) 874 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT) 875 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT) 876 877 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1)) 878 879 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS) 880 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS) 881 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS) 882 883 #define DO_AH_FABS_H(N) (float16_is_any_nan(N) ? (N) : DO_FABS(N)) 884 #define DO_AH_FABS_S(N) (float32_is_any_nan(N) ? (N) : DO_FABS(N)) 885 #define DO_AH_FABS_D(N) (float64_is_any_nan(N) ? (N) : DO_FABS(N)) 886 887 DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H) 888 DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S) 889 DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D) 890 891 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1)) 892 893 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG) 894 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG) 895 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG) 896 897 #define DO_AH_FNEG_H(N) (float16_is_any_nan(N) ? (N) : DO_FNEG(N)) 898 #define DO_AH_FNEG_S(N) (float32_is_any_nan(N) ? (N) : DO_FNEG(N)) 899 #define DO_AH_FNEG_D(N) (float64_is_any_nan(N) ? (N) : DO_FNEG(N)) 900 901 DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H) 902 DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S) 903 DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D) 904 905 #define DO_NOT(N) (~N) 906 907 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT) 908 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT) 909 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT) 910 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT) 911 912 #define DO_SXTB(N) ((int8_t)N) 913 #define DO_SXTH(N) ((int16_t)N) 914 #define DO_SXTS(N) ((int32_t)N) 915 #define DO_UXTB(N) ((uint8_t)N) 916 #define DO_UXTH(N) ((uint16_t)N) 917 #define DO_UXTS(N) ((uint32_t)N) 918 919 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB) 920 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB) 921 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH) 922 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB) 923 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH) 924 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS) 925 926 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB) 927 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB) 928 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH) 929 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB) 930 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH) 931 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS) 932 933 #define DO_ABS(N) (N < 0 ? 
-N : N) 934 935 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS) 936 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS) 937 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS) 938 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS) 939 940 #define DO_NEG(N) (-N) 941 942 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG) 943 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) 944 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) 945 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) 946 947 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) 948 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) 949 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) 950 951 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) 952 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) 953 954 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) 955 956 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc) 957 { 958 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 959 uint64_t *d = vd, *n = vn; 960 uint8_t *pg = vg; 961 962 for (i = 0; i < opr_sz; i += 2) { 963 if (pg[H1(i)] & 1) { 964 uint64_t n0 = n[i + 0]; 965 uint64_t n1 = n[i + 1]; 966 d[i + 0] = n1; 967 d[i + 1] = n0; 968 } 969 } 970 } 971 972 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) 973 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) 974 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) 975 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) 976 977 #define DO_SQABS(X) \ 978 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 979 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; }) 980 981 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS) 982 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS) 983 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS) 984 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS) 985 986 #define DO_SQNEG(X) \ 987 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 988 x_ == min_ ? -min_ - 1 : -x_; }) 989 990 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG) 991 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG) 992 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG) 993 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG) 994 995 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32) 996 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32) 997 998 /* Three-operand expander, unpredicated, in which the third operand is "wide". 999 */ 1000 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ 1001 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1002 { \ 1003 intptr_t i, opr_sz = simd_oprsz(desc); \ 1004 for (i = 0; i < opr_sz; ) { \ 1005 TYPEW mm = *(TYPEW *)(vm + i); \ 1006 do { \ 1007 TYPE nn = *(TYPE *)(vn + H(i)); \ 1008 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 1009 i += sizeof(TYPE); \ 1010 } while (i & 7); \ 1011 } \ 1012 } 1013 1014 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR) 1015 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR) 1016 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL) 1017 1018 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR) 1019 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 1020 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 1021 1022 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR) 1023 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 1024 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 1025 1026 #undef DO_ZZW 1027 1028 #undef DO_CLS_B 1029 #undef DO_CLS_H 1030 #undef DO_CLZ_B 1031 #undef DO_CLZ_H 1032 #undef DO_CNOT 1033 #undef DO_FABS 1034 #undef DO_FNEG 1035 #undef DO_ABS 1036 #undef DO_NEG 1037 #undef DO_ZPZ 1038 #undef DO_ZPZ_D 1039 1040 /* 1041 * Three-operand expander, unpredicated, in which the two inputs are 1042 * selected from the top or bottom half of the wide column. 
1043 */ 1044 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1045 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1046 { \ 1047 intptr_t i, opr_sz = simd_oprsz(desc); \ 1048 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1049 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1050 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1051 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1052 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1053 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1054 } \ 1055 } 1056 1057 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1058 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1059 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1060 1061 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1062 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1063 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1064 1065 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1066 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1067 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1068 1069 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1070 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1071 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1072 1073 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1074 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1075 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1076 1077 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1078 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1079 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1080 1081 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1082 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1083 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1084 1085 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1086 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1087 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1088 1089 /* Note that the multiply cannot overflow, but the doubling can. 
*/ 1090 static inline int16_t do_sqdmull_h(int16_t n, int16_t m) 1091 { 1092 int16_t val = n * m; 1093 return DO_SQADD_H(val, val); 1094 } 1095 1096 static inline int32_t do_sqdmull_s(int32_t n, int32_t m) 1097 { 1098 int32_t val = n * m; 1099 return DO_SQADD_S(val, val); 1100 } 1101 1102 static inline int64_t do_sqdmull_d(int64_t n, int64_t m) 1103 { 1104 int64_t val = n * m; 1105 return do_sqadd_d(val, val); 1106 } 1107 1108 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h) 1109 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1110 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1111 1112 #undef DO_ZZZ_TB 1113 1114 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1115 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1116 { \ 1117 intptr_t i, opr_sz = simd_oprsz(desc); \ 1118 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1119 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1120 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 1121 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1122 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1123 } \ 1124 } 1125 1126 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1127 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1128 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1129 1130 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1131 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1132 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1133 1134 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1135 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1136 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1137 1138 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1139 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1140 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1141 1142 #undef DO_ZZZ_WTB 1143 1144 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \ 1145 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1146 { \ 1147 intptr_t i, opr_sz = simd_oprsz(desc); \ 1148 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \ 1149 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \ 1150 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1151 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \ 1152 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \ 1153 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \ 1154 } \ 1155 } 1156 1157 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR) 1158 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR) 1159 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR) 1160 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR) 1161 1162 #undef DO_ZZZ_NTB 1163 1164 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1165 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1166 { \ 1167 intptr_t i, opr_sz = simd_oprsz(desc); \ 1168 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \ 1169 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1170 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1171 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \ 1172 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1173 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \ 1174 } \ 1175 } 1176 1177 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1178 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1179 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1180 1181 
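/*
 * Illustrative sketch, not used by the translator: one element step of
 * the widening absolute-difference-accumulate that DO_ZZZW_ACC expands
 * (the SABAL pattern above).  In the macro, sel1 is 0 to read the even
 * (bottom) narrow elements and sizeof(TYPEN) to read the odd (top)
 * ones; the hypothetical function below fixes the bottom case.
 */
static inline int16_t example_sabal_bottom_step(const int8_t *n,
                                                const int8_t *m,
                                                int16_t acc)
{
    /* Widen both narrow inputs, take |nn - mm|, and accumulate.  */
    int16_t nn = n[0];
    int16_t mm = m[0];
    return acc + DO_ABD(nn, mm);
}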
DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1182 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1183 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1184 1185 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1186 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1187 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1188 1189 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1190 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1191 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1192 1193 #define DO_NMUL(N, M) -(N * M) 1194 1195 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL) 1196 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL) 1197 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL) 1198 1199 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL) 1200 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL) 1201 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL) 1202 1203 #undef DO_ZZZW_ACC 1204 1205 #define DO_XTNB(NAME, TYPE, OP) \ 1206 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1207 { \ 1208 intptr_t i, opr_sz = simd_oprsz(desc); \ 1209 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1210 TYPE nn = *(TYPE *)(vn + i); \ 1211 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \ 1212 *(TYPE *)(vd + i) = nn; \ 1213 } \ 1214 } 1215 1216 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \ 1217 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1218 { \ 1219 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \ 1220 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1221 TYPE nn = *(TYPE *)(vn + i); \ 1222 *(TYPEN *)(vd + i + odd) = OP(nn); \ 1223 } \ 1224 } 1225 1226 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX) 1227 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX) 1228 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX) 1229 1230 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H) 1231 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S) 1232 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D) 1233 1234 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H) 1235 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S) 1236 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D) 1237 1238 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX) 1239 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX) 1240 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX) 1241 1242 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H) 1243 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S) 1244 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D) 1245 1246 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H) 1247 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S) 1248 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D) 1249 1250 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H) 1251 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S) 1252 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D) 1253 1254 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H) 1255 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S) 1256 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D) 1257 1258 #undef DO_XTNB 1259 #undef DO_XTNT 1260 1261 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1262 { 1263 intptr_t i, opr_sz = simd_oprsz(desc); 1264 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1)); 1265 uint32_t 
inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1266 uint32_t *a = va, *n = vn; 1267 uint64_t *d = vd, *m = vm; 1268 1269 for (i = 0; i < opr_sz / 8; ++i) { 1270 uint32_t e1 = a[2 * i + H4(0)]; 1271 uint32_t e2 = n[2 * i + sel] ^ inv; 1272 uint64_t c = extract64(m[i], 32, 1); 1273 /* Compute and store the entire 33-bit result at once. */ 1274 d[i] = c + e1 + e2; 1275 } 1276 } 1277 1278 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1279 { 1280 intptr_t i, opr_sz = simd_oprsz(desc); 1281 int sel = extract32(desc, SIMD_DATA_SHIFT, 1); 1282 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1283 uint64_t *d = vd, *a = va, *n = vn, *m = vm; 1284 1285 for (i = 0; i < opr_sz / 8; i += 2) { 1286 Int128 e1 = int128_make64(a[i]); 1287 Int128 e2 = int128_make64(n[i + sel] ^ inv); 1288 Int128 c = int128_make64(m[i + 1] & 1); 1289 Int128 r = int128_add(int128_add(e1, e2), c); 1290 d[i + 0] = int128_getlo(r); 1291 d[i + 1] = int128_gethi(r); 1292 } 1293 } 1294 1295 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \ 1296 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1297 { \ 1298 intptr_t i, opr_sz = simd_oprsz(desc); \ 1299 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1300 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1301 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1302 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1303 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1304 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1305 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \ 1306 } \ 1307 } 1308 1309 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1, 1310 do_sqdmull_h, DO_SQADD_H) 1311 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1312 do_sqdmull_s, DO_SQADD_S) 1313 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1314 do_sqdmull_d, do_sqadd_d) 1315 1316 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1, 1317 do_sqdmull_h, DO_SQSUB_H) 1318 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1319 do_sqdmull_s, DO_SQSUB_S) 1320 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1321 do_sqdmull_d, do_sqsub_d) 1322 1323 #undef DO_SQDMLAL 1324 1325 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \ 1326 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1327 { \ 1328 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1329 int rot = simd_data(desc); \ 1330 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1331 bool sub_r = rot == 1 || rot == 2; \ 1332 bool sub_i = rot >= 2; \ 1333 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1334 for (i = 0; i < opr_sz; i += 2) { \ 1335 TYPE elt1_a = n[H(i + sel_a)]; \ 1336 TYPE elt2_a = m[H(i + sel_a)]; \ 1337 TYPE elt2_b = m[H(i + sel_b)]; \ 1338 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \ 1339 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \ 1340 } \ 1341 } 1342 1343 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? 
-1 : 1)) 1344 1345 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA) 1346 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA) 1347 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA) 1348 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA) 1349 1350 #define DO_SQRDMLAH_B(N, M, A, S) \ 1351 do_sqrdmlah_b(N, M, A, S, true) 1352 #define DO_SQRDMLAH_H(N, M, A, S) \ 1353 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); }) 1354 #define DO_SQRDMLAH_S(N, M, A, S) \ 1355 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); }) 1356 #define DO_SQRDMLAH_D(N, M, A, S) \ 1357 do_sqrdmlah_d(N, M, A, S, true) 1358 1359 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B) 1360 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H) 1361 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S) 1362 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D) 1363 1364 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \ 1365 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1366 { \ 1367 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1368 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \ 1369 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \ 1370 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1371 bool sub_r = rot == 1 || rot == 2; \ 1372 bool sub_i = rot >= 2; \ 1373 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1374 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \ 1375 TYPE elt2_a = m[H(i + idx + sel_a)]; \ 1376 TYPE elt2_b = m[H(i + idx + sel_b)]; \ 1377 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \ 1378 TYPE elt1_a = n[H(i + j + sel_a)]; \ 1379 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \ 1380 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \ 1381 } \ 1382 } \ 1383 } 1384 1385 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA) 1386 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA) 1387 1388 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1389 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1390 1391 #undef DO_CMLA 1392 #undef DO_CMLA_FUNC 1393 #undef DO_CMLA_IDX_FUNC 1394 #undef DO_SQRDMLAH_B 1395 #undef DO_SQRDMLAH_H 1396 #undef DO_SQRDMLAH_S 1397 #undef DO_SQRDMLAH_D 1398 1399 /* Note N and M are 4 elements bundled into one unit. */ 1400 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a, 1401 int sel_a, int sel_b, int sub_i) 1402 { 1403 for (int i = 0; i <= 1; i++) { 1404 int32_t elt1_r = (int8_t)(n >> (16 * i)); 1405 int32_t elt1_i = (int8_t)(n >> (16 * i + 8)); 1406 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a)); 1407 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b)); 1408 1409 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1410 } 1411 return a; 1412 } 1413 1414 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a, 1415 int sel_a, int sel_b, int sub_i) 1416 { 1417 for (int i = 0; i <= 1; i++) { 1418 int64_t elt1_r = (int16_t)(n >> (32 * i + 0)); 1419 int64_t elt1_i = (int16_t)(n >> (32 * i + 16)); 1420 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a)); 1421 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b)); 1422 1423 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1424 } 1425 return a; 1426 } 1427 1428 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm, 1429 void *va, uint32_t desc) 1430 { 1431 int opr_sz = simd_oprsz(desc); 1432 int rot = simd_data(desc); 1433 int sel_a = rot & 1; 1434 int sel_b = sel_a ^ 1; 1435 int sub_i = (rot == 0 || rot == 3 ? 
-1 : 1); 1436 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1437 1438 for (int e = 0; e < opr_sz / 4; e++) { 1439 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1440 } 1441 } 1442 1443 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm, 1444 void *va, uint32_t desc) 1445 { 1446 int opr_sz = simd_oprsz(desc); 1447 int rot = simd_data(desc); 1448 int sel_a = rot & 1; 1449 int sel_b = sel_a ^ 1; 1450 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1451 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1452 1453 for (int e = 0; e < opr_sz / 8; e++) { 1454 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1455 } 1456 } 1457 1458 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm, 1459 void *va, uint32_t desc) 1460 { 1461 int opr_sz = simd_oprsz(desc); 1462 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1463 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2)); 1464 int sel_a = rot & 1; 1465 int sel_b = sel_a ^ 1; 1466 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1467 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1468 1469 for (int seg = 0; seg < opr_sz / 4; seg += 4) { 1470 uint32_t seg_m = m[seg + idx]; 1471 for (int e = 0; e < 4; e++) { 1472 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e], 1473 sel_a, sel_b, sub_i); 1474 } 1475 } 1476 } 1477 1478 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm, 1479 void *va, uint32_t desc) 1480 { 1481 int seg, opr_sz = simd_oprsz(desc); 1482 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1483 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 1484 int sel_a = rot & 1; 1485 int sel_b = sel_a ^ 1; 1486 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1487 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1488 1489 for (seg = 0; seg < opr_sz / 8; seg += 2) { 1490 uint64_t seg_m = m[seg + idx]; 1491 for (int e = 0; e < 2; e++) { 1492 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e], 1493 sel_a, sel_b, sub_i); 1494 } 1495 } 1496 } 1497 1498 #define DO_ZZXZ(NAME, TYPE, H, OP) \ 1499 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1500 { \ 1501 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ 1502 intptr_t i, j, idx = simd_data(desc); \ 1503 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \ 1504 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1505 TYPE mm = m[i]; \ 1506 for (j = 0; j < segment; j++) { \ 1507 d[i + j] = OP(n[i + j], mm, a[i + j]); \ 1508 } \ 1509 } \ 1510 } 1511 1512 #define DO_SQRDMLAH_H(N, M, A) \ 1513 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); }) 1514 #define DO_SQRDMLAH_S(N, M, A) \ 1515 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); }) 1516 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true) 1517 1518 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1519 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1520 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D) 1521 1522 #define DO_SQRDMLSH_H(N, M, A) \ 1523 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); }) 1524 #define DO_SQRDMLSH_S(N, M, A) \ 1525 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); }) 1526 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true) 1527 1528 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H) 1529 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S) 1530 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D) 1531 1532 #undef DO_ZZXZ 1533 1534 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1535 void HELPER(NAME)(void *vd, void *vn, 
void *vm, void *va, uint32_t desc) \ 1536 { \ 1537 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1538 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1539 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1540 for (i = 0; i < oprsz; i += 16) { \ 1541 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1542 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1543 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1544 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \ 1545 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \ 1546 } \ 1547 } \ 1548 } 1549 1550 #define DO_MLA(N, M, A) (A + N * M) 1551 1552 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA) 1553 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA) 1554 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA) 1555 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA) 1556 1557 #define DO_MLS(N, M, A) (A - N * M) 1558 1559 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS) 1560 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS) 1561 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS) 1562 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS) 1563 1564 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M)) 1565 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M)) 1566 1567 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S) 1568 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D) 1569 1570 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M)) 1571 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M)) 1572 1573 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S) 1574 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D) 1575 1576 #undef DO_MLA 1577 #undef DO_MLS 1578 #undef DO_ZZXW 1579 1580 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1581 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1582 { \ 1583 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1584 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1585 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1586 for (i = 0; i < oprsz; i += 16) { \ 1587 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1588 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1589 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1590 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \ 1591 } \ 1592 } \ 1593 } 1594 1595 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1596 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1597 1598 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1599 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1600 1601 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1602 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1603 1604 #undef DO_ZZX 1605 1606 #define DO_BITPERM(NAME, TYPE, OP) \ 1607 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1608 { \ 1609 intptr_t i, opr_sz = simd_oprsz(desc); \ 1610 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1611 TYPE nn = *(TYPE *)(vn + i); \ 1612 TYPE mm = *(TYPE *)(vm + i); \ 1613 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \ 1614 } \ 1615 } 1616 1617 static uint64_t bitextract(uint64_t data, uint64_t mask, int n) 1618 { 1619 uint64_t res = 0; 1620 int db, rb = 0; 1621 1622 for (db = 0; db < n; ++db) { 1623 if ((mask >> db) & 1) { 1624 res |= ((data >> db) & 1) 
<< rb; 1625 ++rb; 1626 } 1627 } 1628 return res; 1629 } 1630 1631 DO_BITPERM(sve2_bext_b, uint8_t, bitextract) 1632 DO_BITPERM(sve2_bext_h, uint16_t, bitextract) 1633 DO_BITPERM(sve2_bext_s, uint32_t, bitextract) 1634 DO_BITPERM(sve2_bext_d, uint64_t, bitextract) 1635 1636 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) 1637 { 1638 uint64_t res = 0; 1639 int rb, db = 0; 1640 1641 for (rb = 0; rb < n; ++rb) { 1642 if ((mask >> rb) & 1) { 1643 res |= ((data >> db) & 1) << rb; 1644 ++db; 1645 } 1646 } 1647 return res; 1648 } 1649 1650 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) 1651 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) 1652 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) 1653 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit) 1654 1655 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) 1656 { 1657 uint64_t resm = 0, resu = 0; 1658 int db, rbm = 0, rbu = 0; 1659 1660 for (db = 0; db < n; ++db) { 1661 uint64_t val = (data >> db) & 1; 1662 if ((mask >> db) & 1) { 1663 resm |= val << rbm++; 1664 } else { 1665 resu |= val << rbu++; 1666 } 1667 } 1668 1669 return resm | (resu << rbm); 1670 } 1671 1672 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) 1673 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) 1674 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) 1675 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) 1676 1677 #undef DO_BITPERM 1678 1679 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ 1680 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1681 { \ 1682 intptr_t i, opr_sz = simd_oprsz(desc); \ 1683 int sub_r = simd_data(desc); \ 1684 if (sub_r) { \ 1685 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1686 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1687 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1688 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1689 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1690 acc_r = ADD_OP(acc_r, el2_i); \ 1691 acc_i = SUB_OP(acc_i, el2_r); \ 1692 *(TYPE *)(vd + H(i)) = acc_r; \ 1693 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1694 } \ 1695 } else { \ 1696 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1697 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1698 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1699 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1700 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1701 acc_r = SUB_OP(acc_r, el2_i); \ 1702 acc_i = ADD_OP(acc_i, el2_r); \ 1703 *(TYPE *)(vd + H(i)) = acc_r; \ 1704 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1705 } \ 1706 } \ 1707 } 1708 1709 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) 1710 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) 1711 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) 1712 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) 1713 1714 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) 1715 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) 1716 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) 1717 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) 1718 1719 #undef DO_CADD 1720 1721 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ 1722 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1723 { \ 1724 intptr_t i, opr_sz = simd_oprsz(desc); \ 1725 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ 1726 int shift = simd_data(desc) >> 1; \ 1727 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1728 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ 1729 *(TYPEW *)(vd + HW(i)) = nn << shift; \ 1730 } \ 1731 } 1732 1733 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) 1734 DO_ZZI_SHLL(sve2_sshll_s, int32_t, 
int16_t, H1_4, H1_2) 1735 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) 1736 1737 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) 1738 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) 1739 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) 1740 1741 #undef DO_ZZI_SHLL 1742 1743 /* Two-operand reduction expander, controlled by a predicate. 1744 * The difference between TYPERED and TYPERET has to do with 1745 * sign-extension. E.g. for SMAX, TYPERED must be signed, 1746 * but TYPERET must be unsigned so that e.g. a 32-bit value 1747 * is not sign-extended to the ABI uint64_t return type. 1748 */ 1749 /* ??? If we were to vectorize this by hand the reduction ordering 1750 * would change. For integer operands, this is perfectly fine. 1751 */ 1752 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ 1753 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1754 { \ 1755 intptr_t i, opr_sz = simd_oprsz(desc); \ 1756 TYPERED ret = INIT; \ 1757 for (i = 0; i < opr_sz; ) { \ 1758 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 1759 do { \ 1760 if (pg & 1) { \ 1761 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ 1762 ret = OP(ret, nn); \ 1763 } \ 1764 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ 1765 } while (i & 15); \ 1766 } \ 1767 return (TYPERET)ret; \ 1768 } 1769 1770 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ 1771 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1772 { \ 1773 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 1774 TYPEE *n = vn; \ 1775 uint8_t *pg = vg; \ 1776 TYPER ret = INIT; \ 1777 for (i = 0; i < opr_sz; i += 1) { \ 1778 if (pg[H1(i)] & 1) { \ 1779 TYPEE nn = n[i]; \ 1780 ret = OP(ret, nn); \ 1781 } \ 1782 } \ 1783 return ret; \ 1784 } 1785 1786 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) 1787 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) 1788 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) 1789 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) 1790 1791 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) 1792 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) 1793 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) 1794 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) 1795 1796 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) 1797 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) 1798 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) 1799 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) 1800 1801 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1802 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1803 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1804 1805 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1806 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1807 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1808 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) 1809 1810 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) 1811 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) 1812 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) 1813 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) 1814 1815 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) 1816 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) 1817 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) 
1818 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) 1819 1820 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) 1821 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) 1822 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) 1823 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) 1824 1825 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) 1826 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) 1827 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) 1828 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) 1829 1830 #undef DO_VPZ 1831 #undef DO_VPZ_D 1832 1833 /* Two vector operand, one scalar operand, unpredicated. */ 1834 #define DO_ZZI(NAME, TYPE, OP) \ 1835 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ 1836 { \ 1837 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1838 TYPE s = s64, *d = vd, *n = vn; \ 1839 for (i = 0; i < opr_sz; ++i) { \ 1840 d[i] = OP(n[i], s); \ 1841 } \ 1842 } 1843 1844 #define DO_SUBR(X, Y) (Y - X) 1845 1846 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) 1847 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) 1848 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) 1849 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) 1850 1851 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) 1852 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) 1853 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) 1854 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) 1855 1856 DO_ZZI(sve_smini_b, int8_t, DO_MIN) 1857 DO_ZZI(sve_smini_h, int16_t, DO_MIN) 1858 DO_ZZI(sve_smini_s, int32_t, DO_MIN) 1859 DO_ZZI(sve_smini_d, int64_t, DO_MIN) 1860 1861 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) 1862 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) 1863 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) 1864 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) 1865 1866 DO_ZZI(sve_umini_b, uint8_t, DO_MIN) 1867 DO_ZZI(sve_umini_h, uint16_t, DO_MIN) 1868 DO_ZZI(sve_umini_s, uint32_t, DO_MIN) 1869 DO_ZZI(sve_umini_d, uint64_t, DO_MIN) 1870 1871 #undef DO_ZZI 1872 1873 #undef DO_AND 1874 #undef DO_ORR 1875 #undef DO_EOR 1876 #undef DO_BIC 1877 #undef DO_ADD 1878 #undef DO_SUB 1879 #undef DO_MAX 1880 #undef DO_MIN 1881 #undef DO_ABD 1882 #undef DO_MUL 1883 #undef DO_DIV 1884 #undef DO_ASR 1885 #undef DO_LSR 1886 #undef DO_LSL 1887 #undef DO_SUBR 1888 1889 /* Similar to the ARM LastActiveElement pseudocode function, except the 1890 result is multiplied by the element size. This includes the not found 1891 indication; e.g. not found for esz=3 is -8. */ 1892 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) 1893 { 1894 uint64_t mask = pred_esz_masks[esz]; 1895 intptr_t i = words; 1896 1897 do { 1898 uint64_t this_g = g[--i] & mask; 1899 if (this_g) { 1900 return i * 64 + (63 - clz64(this_g)); 1901 } 1902 } while (i > 0); 1903 return (intptr_t)-1 << esz; 1904 } 1905 1906 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) 1907 { 1908 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1909 uint32_t flags = PREDTEST_INIT; 1910 uint64_t *d = vd, *g = vg; 1911 intptr_t i = 0; 1912 1913 do { 1914 uint64_t this_d = d[i]; 1915 uint64_t this_g = g[i]; 1916 1917 if (this_g) { 1918 if (!(flags & 4)) { 1919 /* Set in D the first bit of G. 
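 * (G & -G isolates the lowest set bit of G: for an illustrative
 * g = 0b0110 it yields 0b0010.  OR-ing that into D is exactly what
 * PFIRST needs, setting only the first active element.)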
*/ 1920 this_d |= this_g & -this_g; 1921 d[i] = this_d; 1922 } 1923 flags = iter_predtest_fwd(this_d, this_g, flags); 1924 } 1925 } while (++i < words); 1926 1927 return flags; 1928 } 1929 1930 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) 1931 { 1932 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1933 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 1934 uint32_t flags = PREDTEST_INIT; 1935 uint64_t *d = vd, *g = vg, esz_mask; 1936 intptr_t i, next; 1937 1938 next = last_active_element(vd, words, esz) + (1 << esz); 1939 esz_mask = pred_esz_masks[esz]; 1940 1941 /* Similar to the pseudocode for pnext, but scaled by ESZ 1942 so that we find the correct bit. */ 1943 if (next < words * 64) { 1944 uint64_t mask = -1; 1945 1946 if (next & 63) { 1947 mask = ~((1ull << (next & 63)) - 1); 1948 next &= -64; 1949 } 1950 do { 1951 uint64_t this_g = g[next / 64] & esz_mask & mask; 1952 if (this_g != 0) { 1953 next = (next & -64) + ctz64(this_g); 1954 break; 1955 } 1956 next += 64; 1957 mask = -1; 1958 } while (next < words * 64); 1959 } 1960 1961 i = 0; 1962 do { 1963 uint64_t this_d = 0; 1964 if (i == next / 64) { 1965 this_d = 1ull << (next & 63); 1966 } 1967 d[i] = this_d; 1968 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); 1969 } while (++i < words); 1970 1971 return flags; 1972 } 1973 1974 /* 1975 * Copy Zn into Zd, and store zero into inactive elements. 1976 * If inv, store zeros into the active elements. 1977 */ 1978 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) 1979 { 1980 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1981 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1982 uint64_t *d = vd, *n = vn; 1983 uint8_t *pg = vg; 1984 1985 for (i = 0; i < opr_sz; i += 1) { 1986 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); 1987 } 1988 } 1989 1990 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) 1991 { 1992 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1993 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1994 uint64_t *d = vd, *n = vn; 1995 uint8_t *pg = vg; 1996 1997 for (i = 0; i < opr_sz; i += 1) { 1998 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); 1999 } 2000 } 2001 2002 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) 2003 { 2004 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2005 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 2006 uint64_t *d = vd, *n = vn; 2007 uint8_t *pg = vg; 2008 2009 for (i = 0; i < opr_sz; i += 1) { 2010 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); 2011 } 2012 } 2013 2014 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) 2015 { 2016 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2017 uint64_t *d = vd, *n = vn; 2018 uint8_t *pg = vg; 2019 uint8_t inv = simd_data(desc); 2020 2021 for (i = 0; i < opr_sz; i += 1) { 2022 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); 2023 } 2024 } 2025 2026 /* Three-operand expander, immediate operand, controlled by a predicate. 2027 */ 2028 #define DO_ZPZI(NAME, TYPE, H, OP) \ 2029 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2030 { \ 2031 intptr_t i, opr_sz = simd_oprsz(desc); \ 2032 TYPE imm = simd_data(desc); \ 2033 for (i = 0; i < opr_sz; ) { \ 2034 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2035 do { \ 2036 if (pg & 1) { \ 2037 TYPE nn = *(TYPE *)(vn + H(i)); \ 2038 *(TYPE *)(vd + H(i)) = OP(nn, imm); \ 2039 } \ 2040 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2041 } while (i & 15); \ 2042 } \ 2043 } 2044 2045 /* Similarly, specialized for 64-bit operands. 
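 * For the .D forms the governing predicate contributes one byte per
 * element, and only bit 0 of each byte is consulted (pg[H1(i)] & 1).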
*/ 2046 #define DO_ZPZI_D(NAME, TYPE, OP) \ 2047 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2048 { \ 2049 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2050 TYPE *d = vd, *n = vn; \ 2051 TYPE imm = simd_data(desc); \ 2052 uint8_t *pg = vg; \ 2053 for (i = 0; i < opr_sz; i += 1) { \ 2054 if (pg[H1(i)] & 1) { \ 2055 TYPE nn = n[i]; \ 2056 d[i] = OP(nn, imm); \ 2057 } \ 2058 } \ 2059 } 2060 2061 #define DO_SHR(N, M) (N >> M) 2062 #define DO_SHL(N, M) (N << M) 2063 2064 /* Arithmetic shift right for division. This rounds negative numbers 2065 toward zero as per signed division. Therefore before shifting, 2066 when N is negative, add 2**M-1. */ 2067 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) 2068 2069 static inline uint64_t do_urshr(uint64_t x, unsigned sh) 2070 { 2071 if (likely(sh < 64)) { 2072 return (x >> sh) + ((x >> (sh - 1)) & 1); 2073 } else if (sh == 64) { 2074 return x >> 63; 2075 } else { 2076 return 0; 2077 } 2078 } 2079 2080 static inline int64_t do_srshr(int64_t x, unsigned sh) 2081 { 2082 if (likely(sh < 64)) { 2083 return (x >> sh) + ((x >> (sh - 1)) & 1); 2084 } else { 2085 /* Rounding the sign bit always produces 0. */ 2086 return 0; 2087 } 2088 } 2089 2090 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) 2091 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) 2092 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) 2093 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) 2094 2095 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) 2096 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) 2097 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) 2098 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) 2099 2100 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) 2101 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) 2102 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) 2103 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) 2104 2105 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) 2106 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) 2107 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) 2108 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) 2109 2110 /* SVE2 bitwise shift by immediate */ 2111 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) 2112 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) 2113 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) 2114 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) 2115 2116 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) 2117 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) 2118 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) 2119 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) 2120 2121 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) 2122 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) 2123 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) 2124 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) 2125 2126 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) 2127 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) 2128 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) 2129 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) 2130 2131 #define do_suqrshl_b(n, m) \ 2132 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 2133 #define do_suqrshl_h(n, m) \ 2134 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 2135 #define do_suqrshl_s(n, m) \ 2136 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) 2137 #define do_suqrshl_d(n, m) \ 2138 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) 2139 2140 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) 2141 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) 2142 
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) 2143 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) 2144 2145 #undef DO_ASRD 2146 #undef DO_ZPZI 2147 #undef DO_ZPZI_D 2148 2149 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ 2150 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2151 { \ 2152 intptr_t i, opr_sz = simd_oprsz(desc); \ 2153 int shift = simd_data(desc); \ 2154 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2155 TYPEW nn = *(TYPEW *)(vn + i); \ 2156 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ 2157 } \ 2158 } 2159 2160 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 2161 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2162 { \ 2163 intptr_t i, opr_sz = simd_oprsz(desc); \ 2164 int shift = simd_data(desc); \ 2165 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2166 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2167 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ 2168 } \ 2169 } 2170 2171 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) 2172 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) 2173 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) 2174 2175 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) 2176 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) 2177 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) 2178 2179 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) 2180 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) 2181 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) 2182 2183 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 2184 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) 2185 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) 2186 2187 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) 2188 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) 2189 #define DO_SQSHRUN_D(x, sh) \ 2190 do_sat_bhs((int64_t)(x) >> (sh < 64 ? 
sh : 63), 0, UINT32_MAX) 2191 2192 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) 2193 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) 2194 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) 2195 2196 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) 2197 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) 2198 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) 2199 2200 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) 2201 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) 2202 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) 2203 2204 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) 2205 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) 2206 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) 2207 2208 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) 2209 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) 2210 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) 2211 2212 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) 2213 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) 2214 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) 2215 2216 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) 2217 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) 2218 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) 2219 2220 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) 2221 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) 2222 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) 2223 2224 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) 2225 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) 2226 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) 2227 2228 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) 2229 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) 2230 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) 2231 2232 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) 2233 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) 2234 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) 2235 2236 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) 2237 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) 2238 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) 2239 2240 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) 2241 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) 2242 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) 2243 2244 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) 2245 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) 2246 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) 2247 2248 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) 2249 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) 2250 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) 2251 2252 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) 2253 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) 2254 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) 2255 2256 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) 2257 DO_SHRNT(sve2_uqrshrnt_s, 
uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) 2258 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) 2259 2260 #undef DO_SHRNB 2261 #undef DO_SHRNT 2262 2263 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ 2264 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2265 { \ 2266 intptr_t i, opr_sz = simd_oprsz(desc); \ 2267 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2268 TYPEW nn = *(TYPEW *)(vn + i); \ 2269 TYPEW mm = *(TYPEW *)(vm + i); \ 2270 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ 2271 } \ 2272 } 2273 2274 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ 2275 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2276 { \ 2277 intptr_t i, opr_sz = simd_oprsz(desc); \ 2278 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2279 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2280 TYPEW mm = *(TYPEW *)(vm + HW(i)); \ 2281 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ 2282 } \ 2283 } 2284 2285 #define DO_ADDHN(N, M, SH) ((N + M) >> SH) 2286 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) 2287 #define DO_SUBHN(N, M, SH) ((N - M) >> SH) 2288 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) 2289 2290 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) 2291 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) 2292 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) 2293 2294 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) 2295 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) 2296 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) 2297 2298 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) 2299 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) 2300 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) 2301 2302 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) 2303 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) 2304 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) 2305 2306 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) 2307 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) 2308 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) 2309 2310 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) 2311 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) 2312 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) 2313 2314 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) 2315 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) 2316 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) 2317 2318 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) 2319 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) 2320 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) 2321 2322 #undef DO_RSUBHN 2323 #undef DO_SUBHN 2324 #undef DO_RADDHN 2325 #undef DO_ADDHN 2326 2327 #undef DO_BINOPNB 2328 2329 /* Fully general four-operand expander, controlled by a predicate. 
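 * As with the two- and three-operand expanders above, each outer
 * iteration loads 16 predicate bits covering a 16-byte chunk of the
 * vector, and pg >>= sizeof(TYPE) steps the guard bit from one element
 * to the next.  (Illustration: for TYPE == uint32_t, bits 0, 4, 8 and
 * 12 of that 16-bit chunk gate the four elements of the chunk.)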
2330 */ 2331 #define DO_ZPZZZ(NAME, TYPE, H, OP) \ 2332 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2333 void *vg, uint32_t desc) \ 2334 { \ 2335 intptr_t i, opr_sz = simd_oprsz(desc); \ 2336 for (i = 0; i < opr_sz; ) { \ 2337 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2338 do { \ 2339 if (pg & 1) { \ 2340 TYPE nn = *(TYPE *)(vn + H(i)); \ 2341 TYPE mm = *(TYPE *)(vm + H(i)); \ 2342 TYPE aa = *(TYPE *)(va + H(i)); \ 2343 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ 2344 } \ 2345 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2346 } while (i & 15); \ 2347 } \ 2348 } 2349 2350 /* Similarly, specialized for 64-bit operands. */ 2351 #define DO_ZPZZZ_D(NAME, TYPE, OP) \ 2352 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2353 void *vg, uint32_t desc) \ 2354 { \ 2355 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2356 TYPE *d = vd, *a = va, *n = vn, *m = vm; \ 2357 uint8_t *pg = vg; \ 2358 for (i = 0; i < opr_sz; i += 1) { \ 2359 if (pg[H1(i)] & 1) { \ 2360 TYPE aa = a[i], nn = n[i], mm = m[i]; \ 2361 d[i] = OP(aa, nn, mm); \ 2362 } \ 2363 } \ 2364 } 2365 2366 #define DO_MLA(A, N, M) (A + N * M) 2367 #define DO_MLS(A, N, M) (A - N * M) 2368 2369 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) 2370 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) 2371 2372 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) 2373 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) 2374 2375 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) 2376 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) 2377 2378 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) 2379 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) 2380 2381 #undef DO_MLA 2382 #undef DO_MLS 2383 #undef DO_ZPZZZ 2384 #undef DO_ZPZZZ_D 2385 2386 void HELPER(sve_index_b)(void *vd, uint32_t start, 2387 uint32_t incr, uint32_t desc) 2388 { 2389 intptr_t i, opr_sz = simd_oprsz(desc); 2390 uint8_t *d = vd; 2391 for (i = 0; i < opr_sz; i += 1) { 2392 d[H1(i)] = start + i * incr; 2393 } 2394 } 2395 2396 void HELPER(sve_index_h)(void *vd, uint32_t start, 2397 uint32_t incr, uint32_t desc) 2398 { 2399 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2400 uint16_t *d = vd; 2401 for (i = 0; i < opr_sz; i += 1) { 2402 d[H2(i)] = start + i * incr; 2403 } 2404 } 2405 2406 void HELPER(sve_index_s)(void *vd, uint32_t start, 2407 uint32_t incr, uint32_t desc) 2408 { 2409 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2410 uint32_t *d = vd; 2411 for (i = 0; i < opr_sz; i += 1) { 2412 d[H4(i)] = start + i * incr; 2413 } 2414 } 2415 2416 void HELPER(sve_index_d)(void *vd, uint64_t start, 2417 uint64_t incr, uint32_t desc) 2418 { 2419 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2420 uint64_t *d = vd; 2421 for (i = 0; i < opr_sz; i += 1) { 2422 d[i] = start + i * incr; 2423 } 2424 } 2425 2426 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) 2427 { 2428 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2429 uint32_t sh = simd_data(desc); 2430 uint32_t *d = vd, *n = vn, *m = vm; 2431 for (i = 0; i < opr_sz; i += 1) { 2432 d[i] = n[i] + (m[i] << sh); 2433 } 2434 } 2435 2436 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) 2437 { 2438 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2439 uint64_t sh = simd_data(desc); 2440 uint64_t *d = vd, *n = vn, *m = vm; 2441 for (i = 0; i < opr_sz; i += 1) { 2442 d[i] = n[i] + (m[i] << sh); 2443 } 2444 } 2445 2446 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) 2447 { 2448 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2449 uint64_t sh = simd_data(desc); 2450 uint64_t *d = vd, *n = vn, *m = vm; 2451 for (i = 0; i < opr_sz; i += 1) { 2452 d[i] = 
n[i] + ((uint64_t)(int32_t)m[i] << sh); 2453 } 2454 } 2455 2456 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) 2457 { 2458 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2459 uint64_t sh = simd_data(desc); 2460 uint64_t *d = vd, *n = vn, *m = vm; 2461 for (i = 0; i < opr_sz; i += 1) { 2462 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); 2463 } 2464 } 2465 2466 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) 2467 { 2468 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2469 static const uint16_t coeff[] = { 2470 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, 2471 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, 2472 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, 2473 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, 2474 }; 2475 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2476 uint16_t *d = vd, *n = vn; 2477 2478 for (i = 0; i < opr_sz; i++) { 2479 uint16_t nn = n[i]; 2480 intptr_t idx = extract32(nn, 0, 5); 2481 uint16_t exp = extract32(nn, 5, 5); 2482 d[i] = coeff[idx] | (exp << 10); 2483 } 2484 } 2485 2486 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) 2487 { 2488 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2489 static const uint32_t coeff[] = { 2490 0x000000, 0x0164d2, 0x02cd87, 0x043a29, 2491 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, 2492 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, 2493 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, 2494 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, 2495 0x1ef532, 0x20b051, 0x227043, 0x243516, 2496 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, 2497 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, 2498 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, 2499 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, 2500 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, 2501 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, 2502 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, 2503 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, 2504 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, 2505 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, 2506 }; 2507 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2508 uint32_t *d = vd, *n = vn; 2509 2510 for (i = 0; i < opr_sz; i++) { 2511 uint32_t nn = n[i]; 2512 intptr_t idx = extract32(nn, 0, 6); 2513 uint32_t exp = extract32(nn, 6, 8); 2514 d[i] = coeff[idx] | (exp << 23); 2515 } 2516 } 2517 2518 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) 2519 { 2520 /* These constants are cut-and-paste directly from the ARM pseudocode. 
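 * (A hedged reading, not itself from the pseudocode: entry J appears to
 * hold the 52-bit fraction of 2**(J/64), so that OR-ing in the exponent
 * below yields an approximation of 2**x; e.g. coeff[32] == 0x6A09E667F3BCD,
 * the fraction field of sqrt(2).)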
*/ 2521 static const uint64_t coeff[] = { 2522 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, 2523 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, 2524 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, 2525 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, 2526 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, 2527 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, 2528 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, 2529 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, 2530 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, 2531 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, 2532 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, 2533 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, 2534 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, 2535 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, 2536 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, 2537 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, 2538 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, 2539 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, 2540 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, 2541 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, 2542 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, 2543 0xFA7C1819E90D8ull, 2544 }; 2545 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2546 uint64_t *d = vd, *n = vn; 2547 2548 for (i = 0; i < opr_sz; i++) { 2549 uint64_t nn = n[i]; 2550 intptr_t idx = extract32(nn, 0, 6); 2551 uint64_t exp = extract32(nn, 6, 11); 2552 d[i] = coeff[idx] | (exp << 52); 2553 } 2554 } 2555 2556 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) 2557 { 2558 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2559 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2560 uint16_t *d = vd, *n = vn, *m = vm; 2561 for (i = 0; i < opr_sz; i += 1) { 2562 uint16_t nn = n[i]; 2563 uint16_t mm = m[i]; 2564 if (mm & 1) { 2565 nn = float16_one; 2566 } 2567 if (mm & 2) { 2568 nn = float16_maybe_ah_chs(nn, fpcr_ah); 2569 } 2570 d[i] = nn; 2571 } 2572 } 2573 2574 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) 2575 { 2576 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2577 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2578 uint32_t *d = vd, *n = vn, *m = vm; 2579 for (i = 0; i < opr_sz; i += 1) { 2580 uint32_t nn = n[i]; 2581 uint32_t mm = m[i]; 2582 if (mm & 1) { 2583 nn = float32_one; 2584 } 2585 if (mm & 2) { 2586 nn = float32_maybe_ah_chs(nn, fpcr_ah); 2587 } 2588 d[i] = nn; 2589 } 2590 } 2591 2592 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) 2593 { 2594 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2595 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2596 uint64_t *d = vd, *n = vn, *m = vm; 2597 for (i = 0; i < opr_sz; i += 1) { 2598 uint64_t nn = n[i]; 2599 uint64_t mm = m[i]; 2600 if (mm & 1) { 2601 nn = float64_one; 2602 } 2603 if (mm & 2) { 2604 nn = float64_maybe_ah_chs(nn, fpcr_ah); 2605 } 2606 d[i] = nn; 2607 } 2608 } 2609 2610 /* 2611 * Signed saturating addition with scalar operand. 
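 * The per-element DO_SQADD_* helpers clamp on overflow; as an
 * illustrative example, DO_SQADD_B(100, 100) saturates to INT8_MAX.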
2612 */ 2613 2614 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2615 { 2616 intptr_t i, oprsz = simd_oprsz(desc); 2617 2618 for (i = 0; i < oprsz; i += sizeof(int8_t)) { 2619 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); 2620 } 2621 } 2622 2623 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2624 { 2625 intptr_t i, oprsz = simd_oprsz(desc); 2626 2627 for (i = 0; i < oprsz; i += sizeof(int16_t)) { 2628 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); 2629 } 2630 } 2631 2632 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2633 { 2634 intptr_t i, oprsz = simd_oprsz(desc); 2635 2636 for (i = 0; i < oprsz; i += sizeof(int32_t)) { 2637 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); 2638 } 2639 } 2640 2641 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) 2642 { 2643 intptr_t i, oprsz = simd_oprsz(desc); 2644 2645 for (i = 0; i < oprsz; i += sizeof(int64_t)) { 2646 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); 2647 } 2648 } 2649 2650 /* 2651 * Unsigned saturating addition with scalar operand. 2652 */ 2653 2654 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2655 { 2656 intptr_t i, oprsz = simd_oprsz(desc); 2657 2658 for (i = 0; i < oprsz; i += sizeof(uint8_t)) { 2659 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); 2660 } 2661 } 2662 2663 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2664 { 2665 intptr_t i, oprsz = simd_oprsz(desc); 2666 2667 for (i = 0; i < oprsz; i += sizeof(uint16_t)) { 2668 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); 2669 } 2670 } 2671 2672 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2673 { 2674 intptr_t i, oprsz = simd_oprsz(desc); 2675 2676 for (i = 0; i < oprsz; i += sizeof(uint32_t)) { 2677 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); 2678 } 2679 } 2680 2681 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2682 { 2683 intptr_t i, oprsz = simd_oprsz(desc); 2684 2685 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2686 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); 2687 } 2688 } 2689 2690 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2691 { 2692 intptr_t i, oprsz = simd_oprsz(desc); 2693 2694 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2695 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); 2696 } 2697 } 2698 2699 /* Two operand predicated copy immediate with merge. All valid immediates 2700 * can fit within 17 signed bits in the simd_data field. 
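 * The merge operates on 64-bit chunks: expand_pred_b() and friends turn
 * each predicate byte into an element-granular mask PP, and
 * (mm & pp) | (nn & ~pp) takes the immediate for active elements and the
 * N operand elsewhere.  (Illustration: a predicate byte of 0x05 expands,
 * for the .B form, to the mask 0x0000000000ff00ff.)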
2701 */ 2702 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, 2703 uint64_t mm, uint32_t desc) 2704 { 2705 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2706 uint64_t *d = vd, *n = vn; 2707 uint8_t *pg = vg; 2708 2709 mm = dup_const(MO_8, mm); 2710 for (i = 0; i < opr_sz; i += 1) { 2711 uint64_t nn = n[i]; 2712 uint64_t pp = expand_pred_b(pg[H1(i)]); 2713 d[i] = (mm & pp) | (nn & ~pp); 2714 } 2715 } 2716 2717 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, 2718 uint64_t mm, uint32_t desc) 2719 { 2720 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2721 uint64_t *d = vd, *n = vn; 2722 uint8_t *pg = vg; 2723 2724 mm = dup_const(MO_16, mm); 2725 for (i = 0; i < opr_sz; i += 1) { 2726 uint64_t nn = n[i]; 2727 uint64_t pp = expand_pred_h(pg[H1(i)]); 2728 d[i] = (mm & pp) | (nn & ~pp); 2729 } 2730 } 2731 2732 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, 2733 uint64_t mm, uint32_t desc) 2734 { 2735 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2736 uint64_t *d = vd, *n = vn; 2737 uint8_t *pg = vg; 2738 2739 mm = dup_const(MO_32, mm); 2740 for (i = 0; i < opr_sz; i += 1) { 2741 uint64_t nn = n[i]; 2742 uint64_t pp = expand_pred_s(pg[H1(i)]); 2743 d[i] = (mm & pp) | (nn & ~pp); 2744 } 2745 } 2746 2747 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, 2748 uint64_t mm, uint32_t desc) 2749 { 2750 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2751 uint64_t *d = vd, *n = vn; 2752 uint8_t *pg = vg; 2753 2754 for (i = 0; i < opr_sz; i += 1) { 2755 uint64_t nn = n[i]; 2756 d[i] = (pg[H1(i)] & 1 ? mm : nn); 2757 } 2758 } 2759 2760 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) 2761 { 2762 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2763 uint64_t *d = vd; 2764 uint8_t *pg = vg; 2765 2766 val = dup_const(MO_8, val); 2767 for (i = 0; i < opr_sz; i += 1) { 2768 d[i] = val & expand_pred_b(pg[H1(i)]); 2769 } 2770 } 2771 2772 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) 2773 { 2774 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2775 uint64_t *d = vd; 2776 uint8_t *pg = vg; 2777 2778 val = dup_const(MO_16, val); 2779 for (i = 0; i < opr_sz; i += 1) { 2780 d[i] = val & expand_pred_h(pg[H1(i)]); 2781 } 2782 } 2783 2784 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) 2785 { 2786 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2787 uint64_t *d = vd; 2788 uint8_t *pg = vg; 2789 2790 val = dup_const(MO_32, val); 2791 for (i = 0; i < opr_sz; i += 1) { 2792 d[i] = val & expand_pred_s(pg[H1(i)]); 2793 } 2794 } 2795 2796 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) 2797 { 2798 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2799 uint64_t *d = vd; 2800 uint8_t *pg = vg; 2801 2802 for (i = 0; i < opr_sz; i += 1) { 2803 d[i] = (pg[H1(i)] & 1 ? val : 0); 2804 } 2805 } 2806 2807 /* Big-endian hosts need to frob the byte indices. If the copy 2808 * happens to be 8-byte aligned, then no frobbing necessary. 
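 * Otherwise we fall back to 4-, 2- or 1-byte moves, using the H1_4(),
 * H1_2() and H1() macros to translate between the guest's element
 * ordering and the host's storage order.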
2809 */ 2810 static void swap_memmove(void *vd, void *vs, size_t n) 2811 { 2812 uintptr_t d = (uintptr_t)vd; 2813 uintptr_t s = (uintptr_t)vs; 2814 uintptr_t o = (d | s | n) & 7; 2815 size_t i; 2816 2817 #if !HOST_BIG_ENDIAN 2818 o = 0; 2819 #endif 2820 switch (o) { 2821 case 0: 2822 memmove(vd, vs, n); 2823 break; 2824 2825 case 4: 2826 if (d < s || d >= s + n) { 2827 for (i = 0; i < n; i += 4) { 2828 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2829 } 2830 } else { 2831 for (i = n; i > 0; ) { 2832 i -= 4; 2833 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2834 } 2835 } 2836 break; 2837 2838 case 2: 2839 case 6: 2840 if (d < s || d >= s + n) { 2841 for (i = 0; i < n; i += 2) { 2842 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2843 } 2844 } else { 2845 for (i = n; i > 0; ) { 2846 i -= 2; 2847 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2848 } 2849 } 2850 break; 2851 2852 default: 2853 if (d < s || d >= s + n) { 2854 for (i = 0; i < n; i++) { 2855 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2856 } 2857 } else { 2858 for (i = n; i > 0; ) { 2859 i -= 1; 2860 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2861 } 2862 } 2863 break; 2864 } 2865 } 2866 2867 /* Similarly for memset of 0. */ 2868 static void swap_memzero(void *vd, size_t n) 2869 { 2870 uintptr_t d = (uintptr_t)vd; 2871 uintptr_t o = (d | n) & 7; 2872 size_t i; 2873 2874 /* Usually, the first bit of a predicate is set, so N is 0. */ 2875 if (likely(n == 0)) { 2876 return; 2877 } 2878 2879 #if !HOST_BIG_ENDIAN 2880 o = 0; 2881 #endif 2882 switch (o) { 2883 case 0: 2884 memset(vd, 0, n); 2885 break; 2886 2887 case 4: 2888 for (i = 0; i < n; i += 4) { 2889 *(uint32_t *)H1_4(d + i) = 0; 2890 } 2891 break; 2892 2893 case 2: 2894 case 6: 2895 for (i = 0; i < n; i += 2) { 2896 *(uint16_t *)H1_2(d + i) = 0; 2897 } 2898 break; 2899 2900 default: 2901 for (i = 0; i < n; i++) { 2902 *(uint8_t *)H1(d + i) = 0; 2903 } 2904 break; 2905 } 2906 } 2907 2908 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) 2909 { 2910 intptr_t opr_sz = simd_oprsz(desc); 2911 size_t n_ofs = simd_data(desc); 2912 size_t n_siz = opr_sz - n_ofs; 2913 2914 if (vd != vm) { 2915 swap_memmove(vd, vn + n_ofs, n_siz); 2916 swap_memmove(vd + n_siz, vm, n_ofs); 2917 } else if (vd != vn) { 2918 swap_memmove(vd + n_siz, vd, n_ofs); 2919 swap_memmove(vd, vn + n_ofs, n_siz); 2920 } else { 2921 /* vd == vn == vm. Need temp space. 
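 * With full overlap, the low N_OFS bytes of VM would be clobbered by
 * the first move, so save them in TMP before shifting VD down.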
*/ 2922 ARMVectorReg tmp; 2923 swap_memmove(&tmp, vm, n_ofs); 2924 swap_memmove(vd, vd + n_ofs, n_siz); 2925 memcpy(vd + n_siz, &tmp, n_ofs); 2926 } 2927 } 2928 2929 #define DO_INSR(NAME, TYPE, H) \ 2930 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ 2931 { \ 2932 intptr_t opr_sz = simd_oprsz(desc); \ 2933 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ 2934 *(TYPE *)(vd + H(0)) = val; \ 2935 } 2936 2937 DO_INSR(sve_insr_b, uint8_t, H1) 2938 DO_INSR(sve_insr_h, uint16_t, H1_2) 2939 DO_INSR(sve_insr_s, uint32_t, H1_4) 2940 DO_INSR(sve_insr_d, uint64_t, H1_8) 2941 2942 #undef DO_INSR 2943 2944 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) 2945 { 2946 intptr_t i, j, opr_sz = simd_oprsz(desc); 2947 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2948 uint64_t f = *(uint64_t *)(vn + i); 2949 uint64_t b = *(uint64_t *)(vn + j); 2950 *(uint64_t *)(vd + i) = bswap64(b); 2951 *(uint64_t *)(vd + j) = bswap64(f); 2952 } 2953 } 2954 2955 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) 2956 { 2957 intptr_t i, j, opr_sz = simd_oprsz(desc); 2958 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2959 uint64_t f = *(uint64_t *)(vn + i); 2960 uint64_t b = *(uint64_t *)(vn + j); 2961 *(uint64_t *)(vd + i) = hswap64(b); 2962 *(uint64_t *)(vd + j) = hswap64(f); 2963 } 2964 } 2965 2966 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) 2967 { 2968 intptr_t i, j, opr_sz = simd_oprsz(desc); 2969 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2970 uint64_t f = *(uint64_t *)(vn + i); 2971 uint64_t b = *(uint64_t *)(vn + j); 2972 *(uint64_t *)(vd + i) = rol64(b, 32); 2973 *(uint64_t *)(vd + j) = rol64(f, 32); 2974 } 2975 } 2976 2977 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) 2978 { 2979 intptr_t i, j, opr_sz = simd_oprsz(desc); 2980 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2981 uint64_t f = *(uint64_t *)(vn + i); 2982 uint64_t b = *(uint64_t *)(vn + j); 2983 *(uint64_t *)(vd + i) = b; 2984 *(uint64_t *)(vd + j) = f; 2985 } 2986 } 2987 2988 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); 2989 2990 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, 2991 bool is_tbx, tb_impl_fn *fn) 2992 { 2993 ARMVectorReg scratch; 2994 uintptr_t oprsz = simd_oprsz(desc); 2995 2996 if (unlikely(vd == vn)) { 2997 vn = memcpy(&scratch, vn, oprsz); 2998 } 2999 3000 fn(vd, vn, NULL, vm, oprsz, is_tbx); 3001 } 3002 3003 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, 3004 uint32_t desc, bool is_tbx, tb_impl_fn *fn) 3005 { 3006 ARMVectorReg scratch; 3007 uintptr_t oprsz = simd_oprsz(desc); 3008 3009 if (unlikely(vd == vn0)) { 3010 vn0 = memcpy(&scratch, vn0, oprsz); 3011 if (vd == vn1) { 3012 vn1 = vn0; 3013 } 3014 } else if (unlikely(vd == vn1)) { 3015 vn1 = memcpy(&scratch, vn1, oprsz); 3016 } 3017 3018 fn(vd, vn0, vn1, vm, oprsz, is_tbx); 3019 } 3020 3021 #define DO_TB(SUFF, TYPE, H) \ 3022 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ 3023 void *vm, uintptr_t oprsz, bool is_tbx) \ 3024 { \ 3025 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ 3026 uintptr_t i, nelem = oprsz / sizeof(TYPE); \ 3027 for (i = 0; i < nelem; ++i) { \ 3028 TYPE index = indexes[H1(i)], val = 0; \ 3029 if (index < nelem) { \ 3030 val = tbl0[H(index)]; \ 3031 } else { \ 3032 index -= nelem; \ 3033 if (tbl1 && index < nelem) { \ 3034 val = tbl1[H(index)]; \ 3035 } else if (is_tbx) { \ 3036 continue; \ 3037 } \ 3038 } \ 3039 
d[H(i)] = val; \ 3040 } \ 3041 } \ 3042 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3043 { \ 3044 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ 3045 } \ 3046 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ 3047 void *vm, uint32_t desc) \ 3048 { \ 3049 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ 3050 } \ 3051 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3052 { \ 3053 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ 3054 } 3055 3056 DO_TB(b, uint8_t, H1) 3057 DO_TB(h, uint16_t, H2) 3058 DO_TB(s, uint32_t, H4) 3059 DO_TB(d, uint64_t, H8) 3060 3061 #undef DO_TB 3062 3063 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ 3064 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 3065 { \ 3066 intptr_t i, opr_sz = simd_oprsz(desc); \ 3067 TYPED *d = vd; \ 3068 TYPES *n = vn; \ 3069 ARMVectorReg tmp; \ 3070 if (unlikely(vn - vd < opr_sz)) { \ 3071 n = memcpy(&tmp, n, opr_sz / 2); \ 3072 } \ 3073 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ 3074 d[HD(i)] = n[HS(i)]; \ 3075 } \ 3076 } 3077 3078 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) 3079 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) 3080 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) 3081 3082 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) 3083 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) 3084 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) 3085 3086 #undef DO_UNPK 3087 3088 /* Mask of bits included in the even numbered predicates of width esz. 3089 * We also use this for expand_bits/compress_bits, and so extend the 3090 * same pattern out to 16-bit units. 3091 */ 3092 static const uint64_t even_bit_esz_masks[5] = { 3093 0x5555555555555555ull, 3094 0x3333333333333333ull, 3095 0x0f0f0f0f0f0f0f0full, 3096 0x00ff00ff00ff00ffull, 3097 0x0000ffff0000ffffull, 3098 }; 3099 3100 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits. 3101 * For N==0, this corresponds to the operation that in qemu/bitops.h 3102 * we call half_shuffle64; this algorithm is from Hacker's Delight, 3103 * section 7-2 Shuffling Bits. 3104 */ 3105 static uint64_t expand_bits(uint64_t x, int n) 3106 { 3107 int i; 3108 3109 x &= 0xffffffffu; 3110 for (i = 4; i >= n; i--) { 3111 int sh = 1 << i; 3112 x = ((x << sh) | x) & even_bit_esz_masks[i]; 3113 } 3114 return x; 3115 } 3116 3117 /* Compress units of 2**(N+1) bits to units of 2**N bits. 3118 * For N==0, this corresponds to the operation that in qemu/bitops.h 3119 * we call half_unshuffle64; this algorithm is from Hacker's Delight, 3120 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. 
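 * As an illustrative example for N == 0: expand_bits(0x0b, 0) == 0x45
 * (each input bit moves to an even-numbered output position), and
 * compress_bits(0x45, 0) == 0x0b undoes it, ignoring the odd bits.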
3121 */ 3122 static uint64_t compress_bits(uint64_t x, int n) 3123 { 3124 int i; 3125 3126 for (i = n; i <= 4; i++) { 3127 int sh = 1 << i; 3128 x &= even_bit_esz_masks[i]; 3129 x = (x >> sh) | x; 3130 } 3131 return x & 0xffffffffu; 3132 } 3133 3134 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3135 { 3136 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3137 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3138 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3139 int esize = 1 << esz; 3140 uint64_t *d = vd; 3141 intptr_t i; 3142 3143 if (oprsz <= 8) { 3144 uint64_t nn = *(uint64_t *)vn; 3145 uint64_t mm = *(uint64_t *)vm; 3146 int half = 4 * oprsz; 3147 3148 nn = extract64(nn, high * half, half); 3149 mm = extract64(mm, high * half, half); 3150 nn = expand_bits(nn, esz); 3151 mm = expand_bits(mm, esz); 3152 d[0] = nn | (mm << esize); 3153 } else { 3154 ARMPredicateReg tmp; 3155 3156 /* We produce output faster than we consume input. 3157 Therefore we must be mindful of possible overlap. */ 3158 if (vd == vn) { 3159 vn = memcpy(&tmp, vn, oprsz); 3160 if (vd == vm) { 3161 vm = vn; 3162 } 3163 } else if (vd == vm) { 3164 vm = memcpy(&tmp, vm, oprsz); 3165 } 3166 if (high) { 3167 high = oprsz >> 1; 3168 } 3169 3170 if ((oprsz & 7) == 0) { 3171 uint32_t *n = vn, *m = vm; 3172 high >>= 2; 3173 3174 for (i = 0; i < oprsz / 8; i++) { 3175 uint64_t nn = n[H4(high + i)]; 3176 uint64_t mm = m[H4(high + i)]; 3177 3178 nn = expand_bits(nn, esz); 3179 mm = expand_bits(mm, esz); 3180 d[i] = nn | (mm << esize); 3181 } 3182 } else { 3183 uint8_t *n = vn, *m = vm; 3184 uint16_t *d16 = vd; 3185 3186 for (i = 0; i < oprsz / 2; i++) { 3187 uint16_t nn = n[H1(high + i)]; 3188 uint16_t mm = m[H1(high + i)]; 3189 3190 nn = expand_bits(nn, esz); 3191 mm = expand_bits(mm, esz); 3192 d16[H2(i)] = nn | (mm << esize); 3193 } 3194 } 3195 } 3196 } 3197 3198 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3199 { 3200 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3201 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3202 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; 3203 uint64_t *d = vd, *n = vn, *m = vm; 3204 uint64_t l, h; 3205 intptr_t i; 3206 3207 if (oprsz <= 8) { 3208 l = compress_bits(n[0] >> odd, esz); 3209 h = compress_bits(m[0] >> odd, esz); 3210 d[0] = l | (h << (4 * oprsz)); 3211 } else { 3212 ARMPredicateReg tmp_m; 3213 intptr_t oprsz_16 = oprsz / 16; 3214 3215 if ((vm - vd) < (uintptr_t)oprsz) { 3216 m = memcpy(&tmp_m, vm, oprsz); 3217 } 3218 3219 for (i = 0; i < oprsz_16; i++) { 3220 l = n[2 * i + 0]; 3221 h = n[2 * i + 1]; 3222 l = compress_bits(l >> odd, esz); 3223 h = compress_bits(h >> odd, esz); 3224 d[i] = l | (h << 32); 3225 } 3226 3227 /* 3228 * For VL which is not a multiple of 512, the results from M do not 3229 * align nicely with the uint64_t for D. Put the aligned results 3230 * from M into TMP_M and then copy it into place afterward. 
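 * TMP_M is a full predicate register, so the re-packed results from M
 * can be assembled there with the same loop and then block-copied into
 * the upper half of D once the unaligned tail from N has been written.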
3231 */ 3232 if (oprsz & 15) { 3233 int final_shift = (oprsz & 15) * 2; 3234 3235 l = n[2 * i + 0]; 3236 h = n[2 * i + 1]; 3237 l = compress_bits(l >> odd, esz); 3238 h = compress_bits(h >> odd, esz); 3239 d[i] = l | (h << final_shift); 3240 3241 for (i = 0; i < oprsz_16; i++) { 3242 l = m[2 * i + 0]; 3243 h = m[2 * i + 1]; 3244 l = compress_bits(l >> odd, esz); 3245 h = compress_bits(h >> odd, esz); 3246 tmp_m.p[i] = l | (h << 32); 3247 } 3248 l = m[2 * i + 0]; 3249 h = m[2 * i + 1]; 3250 l = compress_bits(l >> odd, esz); 3251 h = compress_bits(h >> odd, esz); 3252 tmp_m.p[i] = l | (h << final_shift); 3253 3254 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); 3255 } else { 3256 for (i = 0; i < oprsz_16; i++) { 3257 l = m[2 * i + 0]; 3258 h = m[2 * i + 1]; 3259 l = compress_bits(l >> odd, esz); 3260 h = compress_bits(h >> odd, esz); 3261 d[oprsz_16 + i] = l | (h << 32); 3262 } 3263 } 3264 } 3265 } 3266 3267 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3268 { 3269 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3270 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3271 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); 3272 uint64_t *d = vd, *n = vn, *m = vm; 3273 uint64_t mask; 3274 int shr, shl; 3275 intptr_t i; 3276 3277 shl = 1 << esz; 3278 shr = 0; 3279 mask = even_bit_esz_masks[esz]; 3280 if (odd) { 3281 mask <<= shl; 3282 shr = shl; 3283 shl = 0; 3284 } 3285 3286 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { 3287 uint64_t nn = (n[i] & mask) >> shr; 3288 uint64_t mm = (m[i] & mask) << shl; 3289 d[i] = nn + mm; 3290 } 3291 } 3292 3293 /* Reverse units of 2**N bits. */ 3294 static uint64_t reverse_bits_64(uint64_t x, int n) 3295 { 3296 int i, sh; 3297 3298 x = bswap64(x); 3299 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3300 uint64_t mask = even_bit_esz_masks[i]; 3301 x = ((x & mask) << sh) | ((x >> sh) & mask); 3302 } 3303 return x; 3304 } 3305 3306 static uint8_t reverse_bits_8(uint8_t x, int n) 3307 { 3308 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; 3309 int i, sh; 3310 3311 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3312 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); 3313 } 3314 return x; 3315 } 3316 3317 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) 3318 { 3319 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3320 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3321 intptr_t i, oprsz_2 = oprsz / 2; 3322 3323 if (oprsz <= 8) { 3324 uint64_t l = *(uint64_t *)vn; 3325 l = reverse_bits_64(l << (64 - 8 * oprsz), esz); 3326 *(uint64_t *)vd = l; 3327 } else if ((oprsz & 15) == 0) { 3328 for (i = 0; i < oprsz_2; i += 8) { 3329 intptr_t ih = oprsz - 8 - i; 3330 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); 3331 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); 3332 *(uint64_t *)(vd + i) = h; 3333 *(uint64_t *)(vd + ih) = l; 3334 } 3335 } else { 3336 for (i = 0; i < oprsz_2; i += 1) { 3337 intptr_t il = H1(i); 3338 intptr_t ih = H1(oprsz - 1 - i); 3339 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); 3340 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); 3341 *(uint8_t *)(vd + il) = h; 3342 *(uint8_t *)(vd + ih) = l; 3343 } 3344 } 3345 } 3346 3347 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) 3348 { 3349 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3350 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3351 uint64_t *d = vd; 3352 intptr_t i; 3353 3354 if (oprsz <= 8) { 3355 uint64_t nn = *(uint64_t *)vn; 3356 int half = 4 * oprsz; 3357 3358 nn = 
extract64(nn, high * half, half); 3359 nn = expand_bits(nn, 0); 3360 d[0] = nn; 3361 } else { 3362 ARMPredicateReg tmp_n; 3363 3364 /* We produce output faster than we consume input. 3365 Therefore we must be mindful of possible overlap. */ 3366 if ((vn - vd) < (uintptr_t)oprsz) { 3367 vn = memcpy(&tmp_n, vn, oprsz); 3368 } 3369 if (high) { 3370 high = oprsz >> 1; 3371 } 3372 3373 if ((oprsz & 7) == 0) { 3374 uint32_t *n = vn; 3375 high >>= 2; 3376 3377 for (i = 0; i < oprsz / 8; i++) { 3378 uint64_t nn = n[H4(high + i)]; 3379 d[i] = expand_bits(nn, 0); 3380 } 3381 } else { 3382 uint16_t *d16 = vd; 3383 uint8_t *n = vn; 3384 3385 for (i = 0; i < oprsz / 2; i++) { 3386 uint16_t nn = n[H1(high + i)]; 3387 d16[H2(i)] = expand_bits(nn, 0); 3388 } 3389 } 3390 } 3391 } 3392 3393 #define DO_ZIP(NAME, TYPE, H) \ 3394 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3395 { \ 3396 intptr_t oprsz = simd_oprsz(desc); \ 3397 intptr_t odd_ofs = simd_data(desc); \ 3398 intptr_t i, oprsz_2 = oprsz / 2; \ 3399 ARMVectorReg tmp_n, tmp_m; \ 3400 /* We produce output faster than we consume input. \ 3401 Therefore we must be mindful of possible overlap. */ \ 3402 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ 3403 vn = memcpy(&tmp_n, vn, oprsz); \ 3404 } \ 3405 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3406 vm = memcpy(&tmp_m, vm, oprsz); \ 3407 } \ 3408 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ 3409 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ 3410 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ 3411 *(TYPE *)(vm + odd_ofs + H(i)); \ 3412 } \ 3413 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3414 memset(vd + oprsz - 16, 0, 16); \ 3415 } \ 3416 } 3417 3418 DO_ZIP(sve_zip_b, uint8_t, H1) 3419 DO_ZIP(sve_zip_h, uint16_t, H1_2) 3420 DO_ZIP(sve_zip_s, uint32_t, H1_4) 3421 DO_ZIP(sve_zip_d, uint64_t, H1_8) 3422 DO_ZIP(sve2_zip_q, Int128, ) 3423 3424 #define DO_UZP(NAME, TYPE, H) \ 3425 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3426 { \ 3427 intptr_t oprsz = simd_oprsz(desc); \ 3428 intptr_t odd_ofs = simd_data(desc); \ 3429 intptr_t i, p; \ 3430 ARMVectorReg tmp_m; \ 3431 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3432 vm = memcpy(&tmp_m, vm, oprsz); \ 3433 } \ 3434 i = 0, p = odd_ofs; \ 3435 do { \ 3436 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ 3437 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3438 } while (p < oprsz); \ 3439 p -= oprsz; \ 3440 do { \ 3441 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ 3442 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3443 } while (p < oprsz); \ 3444 tcg_debug_assert(i == oprsz); \ 3445 } 3446 3447 DO_UZP(sve_uzp_b, uint8_t, H1) 3448 DO_UZP(sve_uzp_h, uint16_t, H1_2) 3449 DO_UZP(sve_uzp_s, uint32_t, H1_4) 3450 DO_UZP(sve_uzp_d, uint64_t, H1_8) 3451 DO_UZP(sve2_uzp_q, Int128, ) 3452 3453 #define DO_TRN(NAME, TYPE, H) \ 3454 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3455 { \ 3456 intptr_t oprsz = simd_oprsz(desc); \ 3457 intptr_t odd_ofs = simd_data(desc); \ 3458 intptr_t i; \ 3459 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ 3460 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ 3461 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ 3462 *(TYPE *)(vd + H(i + 0)) = ae; \ 3463 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ 3464 } \ 3465 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3466 memset(vd + oprsz - 16, 0, 16); \ 3467 } \ 3468 } 3469 3470 DO_TRN(sve_trn_b, uint8_t, H1) 3471 DO_TRN(sve_trn_h, uint16_t, H1_2) 3472 DO_TRN(sve_trn_s, uint32_t, H1_4) 3473 DO_TRN(sve_trn_d, 
uint64_t, H1_8) 3474 DO_TRN(sve2_trn_q, Int128, ) 3475 3476 #undef DO_ZIP 3477 #undef DO_UZP 3478 #undef DO_TRN 3479 3480 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) 3481 { 3482 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; 3483 uint32_t *d = vd, *n = vn; 3484 uint8_t *pg = vg; 3485 3486 for (i = j = 0; i < opr_sz; i++) { 3487 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { 3488 d[H4(j)] = n[H4(i)]; 3489 j++; 3490 } 3491 } 3492 for (; j < opr_sz; j++) { 3493 d[H4(j)] = 0; 3494 } 3495 } 3496 3497 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) 3498 { 3499 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; 3500 uint64_t *d = vd, *n = vn; 3501 uint8_t *pg = vg; 3502 3503 for (i = j = 0; i < opr_sz; i++) { 3504 if (pg[H1(i)] & 1) { 3505 d[j] = n[i]; 3506 j++; 3507 } 3508 } 3509 for (; j < opr_sz; j++) { 3510 d[j] = 0; 3511 } 3512 } 3513 3514 /* Similar to the ARM LastActiveElement pseudocode function, except the 3515 * result is multiplied by the element size. This includes the not found 3516 * indication; e.g. not found for esz=3 is -8. 3517 */ 3518 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) 3519 { 3520 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 3521 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3522 3523 return last_active_element(vg, words, esz); 3524 } 3525 3526 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) 3527 { 3528 intptr_t opr_sz = simd_oprsz(desc) / 8; 3529 int esz = simd_data(desc); 3530 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; 3531 intptr_t i, first_i, last_i; 3532 ARMVectorReg tmp; 3533 3534 first_i = last_i = 0; 3535 first_g = last_g = 0; 3536 3537 /* Find the extent of the active elements within VG. 
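     * For example (illustrative): with byte elements and VG active only
     * for elements 3..6, the code below computes FIRST_I = 3, LAST_I = 6
     * and LEN = 4, so VD receives VN[3..6] followed by the leading bytes
     * of VM.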
*/ 3538 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { 3539 pg = *(uint64_t *)(vg + i) & mask; 3540 if (pg) { 3541 if (last_g == 0) { 3542 last_g = pg; 3543 last_i = i; 3544 } 3545 first_g = pg; 3546 first_i = i; 3547 } 3548 } 3549 3550 len = 0; 3551 if (first_g != 0) { 3552 first_i = first_i * 8 + ctz64(first_g); 3553 last_i = last_i * 8 + 63 - clz64(last_g); 3554 len = last_i - first_i + (1 << esz); 3555 if (vd == vm) { 3556 vm = memcpy(&tmp, vm, opr_sz * 8); 3557 } 3558 swap_memmove(vd, vn + first_i, len); 3559 } 3560 swap_memmove(vd + len, vm, opr_sz * 8 - len); 3561 } 3562 3563 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, 3564 void *vg, uint32_t desc) 3565 { 3566 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3567 uint64_t *d = vd, *n = vn, *m = vm; 3568 uint8_t *pg = vg; 3569 3570 for (i = 0; i < opr_sz; i += 1) { 3571 uint64_t nn = n[i], mm = m[i]; 3572 uint64_t pp = expand_pred_b(pg[H1(i)]); 3573 d[i] = (nn & pp) | (mm & ~pp); 3574 } 3575 } 3576 3577 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, 3578 void *vg, uint32_t desc) 3579 { 3580 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3581 uint64_t *d = vd, *n = vn, *m = vm; 3582 uint8_t *pg = vg; 3583 3584 for (i = 0; i < opr_sz; i += 1) { 3585 uint64_t nn = n[i], mm = m[i]; 3586 uint64_t pp = expand_pred_h(pg[H1(i)]); 3587 d[i] = (nn & pp) | (mm & ~pp); 3588 } 3589 } 3590 3591 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, 3592 void *vg, uint32_t desc) 3593 { 3594 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3595 uint64_t *d = vd, *n = vn, *m = vm; 3596 uint8_t *pg = vg; 3597 3598 for (i = 0; i < opr_sz; i += 1) { 3599 uint64_t nn = n[i], mm = m[i]; 3600 uint64_t pp = expand_pred_s(pg[H1(i)]); 3601 d[i] = (nn & pp) | (mm & ~pp); 3602 } 3603 } 3604 3605 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, 3606 void *vg, uint32_t desc) 3607 { 3608 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3609 uint64_t *d = vd, *n = vn, *m = vm; 3610 uint8_t *pg = vg; 3611 3612 for (i = 0; i < opr_sz; i += 1) { 3613 uint64_t nn = n[i], mm = m[i]; 3614 d[i] = (pg[H1(i)] & 1 ? nn : mm); 3615 } 3616 } 3617 3618 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, 3619 void *vg, uint32_t desc) 3620 { 3621 intptr_t i, opr_sz = simd_oprsz(desc) / 16; 3622 Int128 *d = vd, *n = vn, *m = vm; 3623 uint16_t *pg = vg; 3624 3625 for (i = 0; i < opr_sz; i += 1) { 3626 d[i] = (pg[H2(i)] & 1 ? n : m)[i]; 3627 } 3628 } 3629 3630 /* Two operand comparison controlled by a predicate. 3631 * ??? It is very tempting to want to be able to expand this inline 3632 * with x86 instructions, e.g. 3633 * 3634 * vcmpeqw zm, zn, %ymm0 3635 * vpmovmskb %ymm0, %eax 3636 * and $0x5555, %eax 3637 * and pg, %eax 3638 * 3639 * or even aarch64, e.g. 3640 * 3641 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 3642 * cmeq v0.8h, zn, zm 3643 * and v0.8h, v0.8h, mask 3644 * addv h0, v0.8h 3645 * and v0.8b, pg 3646 * 3647 * However, coming up with an abstraction that allows vector inputs and 3648 * a scalar output, and also handles the byte-ordering of sub-uint64_t 3649 * scalar outputs, is tricky. 
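 *
 * In the expansion below, each iteration of the outer loop covers one
 * 64-bit guard word (64 bytes of vector); the per-element result lands
 * in the low bit of each element's predicate slot, which is why the
 * MASK constants narrow from all-ones for bytes down to 0x0101...01
 * for doublewords.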
3650 */ 3651 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ 3652 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3653 { \ 3654 intptr_t opr_sz = simd_oprsz(desc); \ 3655 uint32_t flags = PREDTEST_INIT; \ 3656 intptr_t i = opr_sz; \ 3657 do { \ 3658 uint64_t out = 0, pg; \ 3659 do { \ 3660 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3661 TYPE nn = *(TYPE *)(vn + H(i)); \ 3662 TYPE mm = *(TYPE *)(vm + H(i)); \ 3663 out |= nn OP mm; \ 3664 } while (i & 63); \ 3665 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3666 out &= pg; \ 3667 *(uint64_t *)(vd + (i >> 3)) = out; \ 3668 flags = iter_predtest_bwd(out, pg, flags); \ 3669 } while (i > 0); \ 3670 return flags; \ 3671 } 3672 3673 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ 3674 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3675 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ 3676 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3677 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ 3678 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3679 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ 3680 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3681 3682 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) 3683 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) 3684 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) 3685 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) 3686 3687 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) 3688 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) 3689 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) 3690 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) 3691 3692 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) 3693 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) 3694 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) 3695 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) 3696 3697 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) 3698 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) 3699 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) 3700 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) 3701 3702 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) 3703 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) 3704 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) 3705 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) 3706 3707 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) 3708 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) 3709 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) 3710 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) 3711 3712 #undef DO_CMP_PPZZ_B 3713 #undef DO_CMP_PPZZ_H 3714 #undef DO_CMP_PPZZ_S 3715 #undef DO_CMP_PPZZ_D 3716 #undef DO_CMP_PPZZ 3717 3718 /* Similar, but the second source is "wide". 
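 * Each element of the first source is compared against the 64-bit element
 * of the second source that occupies the same 64-bit chunk of the vector.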
*/ 3719 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ 3720 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3721 { \ 3722 intptr_t opr_sz = simd_oprsz(desc); \ 3723 uint32_t flags = PREDTEST_INIT; \ 3724 intptr_t i = opr_sz; \ 3725 do { \ 3726 uint64_t out = 0, pg; \ 3727 do { \ 3728 TYPEW mm = *(TYPEW *)(vm + i - 8); \ 3729 do { \ 3730 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3731 TYPE nn = *(TYPE *)(vn + H(i)); \ 3732 out |= nn OP mm; \ 3733 } while (i & 7); \ 3734 } while (i & 63); \ 3735 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3736 out &= pg; \ 3737 *(uint64_t *)(vd + (i >> 3)) = out; \ 3738 flags = iter_predtest_bwd(out, pg, flags); \ 3739 } while (i > 0); \ 3740 return flags; \ 3741 } 3742 3743 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ 3744 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) 3745 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ 3746 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) 3747 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ 3748 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) 3749 3750 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) 3751 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) 3752 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) 3753 3754 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) 3755 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) 3756 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) 3757 3758 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) 3759 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) 3760 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) 3761 3762 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) 3763 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) 3764 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) 3765 3766 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) 3767 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) 3768 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) 3769 3770 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) 3771 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) 3772 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) 3773 3774 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) 3775 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) 3776 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) 3777 3778 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) 3779 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) 3780 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) 3781 3782 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) 3783 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) 3784 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) 3785 3786 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) 3787 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) 3788 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) 3789 3790 #undef DO_CMP_PPZW_B 3791 #undef DO_CMP_PPZW_H 3792 #undef DO_CMP_PPZW_S 3793 #undef DO_CMP_PPZW 3794 3795 /* Similar, but the second source is immediate. 
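 * The immediate is passed in simd_data(desc) and compared against every
 * element of the first source.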
*/ 3796 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ 3797 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 3798 { \ 3799 intptr_t opr_sz = simd_oprsz(desc); \ 3800 uint32_t flags = PREDTEST_INIT; \ 3801 TYPE mm = simd_data(desc); \ 3802 intptr_t i = opr_sz; \ 3803 do { \ 3804 uint64_t out = 0, pg; \ 3805 do { \ 3806 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3807 TYPE nn = *(TYPE *)(vn + H(i)); \ 3808 out |= nn OP mm; \ 3809 } while (i & 63); \ 3810 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3811 out &= pg; \ 3812 *(uint64_t *)(vd + (i >> 3)) = out; \ 3813 flags = iter_predtest_bwd(out, pg, flags); \ 3814 } while (i > 0); \ 3815 return flags; \ 3816 } 3817 3818 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \ 3819 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3820 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \ 3821 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3822 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \ 3823 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3824 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \ 3825 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3826 3827 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) 3828 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) 3829 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) 3830 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) 3831 3832 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) 3833 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) 3834 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) 3835 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) 3836 3837 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) 3838 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) 3839 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) 3840 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) 3841 3842 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) 3843 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) 3844 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) 3845 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) 3846 3847 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) 3848 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) 3849 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) 3850 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) 3851 3852 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) 3853 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) 3854 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) 3855 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) 3856 3857 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) 3858 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) 3859 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) 3860 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) 3861 3862 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) 3863 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) 3864 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) 3865 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) 3866 3867 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) 3868 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) 3869 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) 3870 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) 3871 3872 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) 3873 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) 3874 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) 3875 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) 3876 3877 #undef DO_CMP_PPZI_B 3878 #undef DO_CMP_PPZI_H 3879 #undef DO_CMP_PPZI_S 3880 #undef DO_CMP_PPZI_D 3881 #undef DO_CMP_PPZI 3882 3883 /* Similar to the ARM LastActive pseudocode function. 
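 * Returns true if VD is set at the position of the highest set bit of VG,
 * i.e. if the last guard-active element is also set in VD.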
*/ 3884 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) 3885 { 3886 intptr_t i; 3887 3888 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { 3889 uint64_t pg = *(uint64_t *)(vg + i); 3890 if (pg) { 3891 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; 3892 } 3893 } 3894 return 0; 3895 } 3896 3897 /* Compute a mask into RETB that is true for all G, up to and including 3898 * (if after) or excluding (if !after) the first G & N. 3899 * Return true if BRK found. 3900 */ 3901 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, 3902 bool brk, bool after) 3903 { 3904 uint64_t b; 3905 3906 if (brk) { 3907 b = 0; 3908 } else if ((g & n) == 0) { 3909 /* For all G, no N are set; break not found. */ 3910 b = g; 3911 } else { 3912 /* Break somewhere in N. Locate it. */ 3913 b = g & n; /* guard true, pred true */ 3914 b = b & -b; /* first such */ 3915 if (after) { 3916 b = b | (b - 1); /* break after same */ 3917 } else { 3918 b = b - 1; /* break before same */ 3919 } 3920 brk = true; 3921 } 3922 3923 *retb = b; 3924 return brk; 3925 } 3926 3927 /* Compute a zeroing BRK. */ 3928 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, 3929 intptr_t oprsz, bool after) 3930 { 3931 bool brk = false; 3932 intptr_t i; 3933 3934 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3935 uint64_t this_b, this_g = g[i]; 3936 3937 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3938 d[i] = this_b & this_g; 3939 } 3940 } 3941 3942 /* Likewise, but also compute flags. */ 3943 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, 3944 intptr_t oprsz, bool after) 3945 { 3946 uint32_t flags = PREDTEST_INIT; 3947 bool brk = false; 3948 intptr_t i; 3949 3950 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3951 uint64_t this_b, this_d, this_g = g[i]; 3952 3953 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3954 d[i] = this_d = this_b & this_g; 3955 flags = iter_predtest_fwd(this_d, this_g, flags); 3956 } 3957 return flags; 3958 } 3959 3960 /* Compute a merging BRK. */ 3961 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, 3962 intptr_t oprsz, bool after) 3963 { 3964 bool brk = false; 3965 intptr_t i; 3966 3967 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3968 uint64_t this_b, this_g = g[i]; 3969 3970 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3971 d[i] = (this_b & this_g) | (d[i] & ~this_g); 3972 } 3973 } 3974 3975 /* Likewise, but also compute flags. */ 3976 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, 3977 intptr_t oprsz, bool after) 3978 { 3979 uint32_t flags = PREDTEST_INIT; 3980 bool brk = false; 3981 intptr_t i; 3982 3983 for (i = 0; i < oprsz / 8; ++i) { 3984 uint64_t this_b, this_d = d[i], this_g = g[i]; 3985 3986 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3987 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); 3988 flags = iter_predtest_fwd(this_d, this_g, flags); 3989 } 3990 return flags; 3991 } 3992 3993 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) 3994 { 3995 /* It is quicker to zero the whole predicate than loop on OPRSZ. 3996 * The compiler should turn this into 4 64-bit integer stores. 
3997 */ 3998 memset(d, 0, sizeof(ARMPredicateReg)); 3999 return PREDTEST_INIT; 4000 } 4001 4002 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, 4003 uint32_t pred_desc) 4004 { 4005 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4006 if (last_active_pred(vn, vg, oprsz)) { 4007 compute_brk_z(vd, vm, vg, oprsz, true); 4008 } else { 4009 do_zero(vd, oprsz); 4010 } 4011 } 4012 4013 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, 4014 uint32_t pred_desc) 4015 { 4016 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4017 if (last_active_pred(vn, vg, oprsz)) { 4018 return compute_brks_z(vd, vm, vg, oprsz, true); 4019 } else { 4020 return do_zero(vd, oprsz); 4021 } 4022 } 4023 4024 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, 4025 uint32_t pred_desc) 4026 { 4027 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4028 if (last_active_pred(vn, vg, oprsz)) { 4029 compute_brk_z(vd, vm, vg, oprsz, false); 4030 } else { 4031 do_zero(vd, oprsz); 4032 } 4033 } 4034 4035 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, 4036 uint32_t pred_desc) 4037 { 4038 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4039 if (last_active_pred(vn, vg, oprsz)) { 4040 return compute_brks_z(vd, vm, vg, oprsz, false); 4041 } else { 4042 return do_zero(vd, oprsz); 4043 } 4044 } 4045 4046 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4047 { 4048 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4049 compute_brk_z(vd, vn, vg, oprsz, true); 4050 } 4051 4052 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4053 { 4054 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4055 return compute_brks_z(vd, vn, vg, oprsz, true); 4056 } 4057 4058 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4059 { 4060 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4061 compute_brk_z(vd, vn, vg, oprsz, false); 4062 } 4063 4064 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4065 { 4066 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4067 return compute_brks_z(vd, vn, vg, oprsz, false); 4068 } 4069 4070 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4071 { 4072 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4073 compute_brk_m(vd, vn, vg, oprsz, true); 4074 } 4075 4076 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4077 { 4078 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4079 return compute_brks_m(vd, vn, vg, oprsz, true); 4080 } 4081 4082 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4083 { 4084 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4085 compute_brk_m(vd, vn, vg, oprsz, false); 4086 } 4087 4088 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4089 { 4090 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4091 return compute_brks_m(vd, vn, vg, oprsz, false); 4092 } 4093 4094 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4095 { 4096 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4097 if (!last_active_pred(vn, vg, oprsz)) { 4098 do_zero(vd, oprsz); 4099 } 4100 } 4101 4102 /* As if PredTest(Ones(PL), D, esz). 
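 * I.e. compute NZCV for D using an all-true governing predicate of the
 * given element size, truncated to the operation size.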
*/ 4103 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, 4104 uint64_t esz_mask) 4105 { 4106 uint32_t flags = PREDTEST_INIT; 4107 intptr_t i; 4108 4109 for (i = 0; i < oprsz / 8; i++) { 4110 flags = iter_predtest_fwd(d->p[i], esz_mask, flags); 4111 } 4112 if (oprsz & 7) { 4113 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); 4114 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); 4115 } 4116 return flags; 4117 } 4118 4119 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4120 { 4121 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4122 if (last_active_pred(vn, vg, oprsz)) { 4123 return predtest_ones(vd, oprsz, -1); 4124 } else { 4125 return do_zero(vd, oprsz); 4126 } 4127 } 4128 4129 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) 4130 { 4131 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 4132 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4133 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; 4134 intptr_t i; 4135 4136 for (i = 0; i < words; ++i) { 4137 uint64_t t = n[i] & g[i] & mask; 4138 sum += ctpop64(t); 4139 } 4140 return sum; 4141 } 4142 4143 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) 4144 { 4145 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4146 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4147 uint64_t esz_mask = pred_esz_masks[esz]; 4148 ARMPredicateReg *d = vd; 4149 uint32_t flags; 4150 intptr_t i; 4151 4152 /* Begin with a zero predicate register. */ 4153 flags = do_zero(d, oprsz); 4154 if (count == 0) { 4155 return flags; 4156 } 4157 4158 /* Set all of the requested bits. */ 4159 for (i = 0; i < count / 64; ++i) { 4160 d->p[i] = esz_mask; 4161 } 4162 if (count & 63) { 4163 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; 4164 } 4165 4166 return predtest_ones(d, oprsz, esz_mask); 4167 } 4168 4169 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) 4170 { 4171 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4172 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4173 uint64_t esz_mask = pred_esz_masks[esz]; 4174 ARMPredicateReg *d = vd; 4175 intptr_t i, invcount, oprbits; 4176 uint64_t bits; 4177 4178 if (count == 0) { 4179 return do_zero(d, oprsz); 4180 } 4181 4182 oprbits = oprsz * 8; 4183 tcg_debug_assert(count <= oprbits); 4184 4185 bits = esz_mask; 4186 if (oprbits & 63) { 4187 bits &= MAKE_64BIT_MASK(0, oprbits & 63); 4188 } 4189 4190 invcount = oprbits - count; 4191 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { 4192 d->p[i] = bits; 4193 bits = esz_mask; 4194 } 4195 4196 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); 4197 4198 while (--i >= 0) { 4199 d->p[i] = 0; 4200 } 4201 4202 return predtest_ones(d, oprsz, esz_mask); 4203 } 4204 4205 /* Recursive reduction on a function; 4206 * C.f. the ARM ARM function ReducePredicated. 4207 * 4208 * While it would be possible to write this without the DATA temporary, 4209 * it is much simpler to process the predicate register this way. 4210 * The recursion is bounded to depth 7 (128 fp16 elements), so there's 4211 * little to gain with a more complex non-recursive form. 
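 *
 * For example (illustrative): with a 128-bit vector of four active float32
 * elements a, b, c, d, the recursion computes FUNC(FUNC(a, b), FUNC(c, d));
 * inactive and trailing lanes are first padded with IDENT so that they do
 * not affect the result.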
4212 */ 4213 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ 4214 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ 4215 { \ 4216 if (n == 1) { \ 4217 return *data; \ 4218 } else { \ 4219 uintptr_t half = n / 2; \ 4220 TYPE lo = NAME##_reduce(data, status, half); \ 4221 TYPE hi = NAME##_reduce(data + half, status, half); \ 4222 return FUNC(lo, hi, status); \ 4223 } \ 4224 } \ 4225 uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \ 4226 { \ 4227 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ 4228 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ 4229 for (i = 0; i < oprsz; ) { \ 4230 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 4231 do { \ 4232 TYPE nn = *(TYPE *)(vn + H(i)); \ 4233 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \ 4234 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 4235 } while (i & 15); \ 4236 } \ 4237 for (; i < maxsz; i += sizeof(TYPE)) { \ 4238 *(TYPE *)((void *)data + i) = IDENT; \ 4239 } \ 4240 return NAME##_reduce(data, s, maxsz / sizeof(TYPE)); \ 4241 } 4242 4243 DO_REDUCE(sve_faddv_h, float16, H1_2, float16_add, float16_zero) 4244 DO_REDUCE(sve_faddv_s, float32, H1_4, float32_add, float32_zero) 4245 DO_REDUCE(sve_faddv_d, float64, H1_8, float64_add, float64_zero) 4246 4247 /* Identity is floatN_default_nan, without the function call. */ 4248 DO_REDUCE(sve_fminnmv_h, float16, H1_2, float16_minnum, 0x7E00) 4249 DO_REDUCE(sve_fminnmv_s, float32, H1_4, float32_minnum, 0x7FC00000) 4250 DO_REDUCE(sve_fminnmv_d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL) 4251 4252 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, float16_maxnum, 0x7E00) 4253 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, float32_maxnum, 0x7FC00000) 4254 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL) 4255 4256 DO_REDUCE(sve_fminv_h, float16, H1_2, float16_min, float16_infinity) 4257 DO_REDUCE(sve_fminv_s, float32, H1_4, float32_min, float32_infinity) 4258 DO_REDUCE(sve_fminv_d, float64, H1_8, float64_min, float64_infinity) 4259 4260 DO_REDUCE(sve_fmaxv_h, float16, H1_2, float16_max, float16_chs(float16_infinity)) 4261 DO_REDUCE(sve_fmaxv_s, float32, H1_4, float32_max, float32_chs(float32_infinity)) 4262 DO_REDUCE(sve_fmaxv_d, float64, H1_8, float64_max, float64_chs(float64_infinity)) 4263 4264 DO_REDUCE(sve_ah_fminv_h, float16, H1_2, helper_vfp_ah_minh, float16_infinity) 4265 DO_REDUCE(sve_ah_fminv_s, float32, H1_4, helper_vfp_ah_mins, float32_infinity) 4266 DO_REDUCE(sve_ah_fminv_d, float64, H1_8, helper_vfp_ah_mind, float64_infinity) 4267 4268 DO_REDUCE(sve_ah_fmaxv_h, float16, H1_2, helper_vfp_ah_maxh, 4269 float16_chs(float16_infinity)) 4270 DO_REDUCE(sve_ah_fmaxv_s, float32, H1_4, helper_vfp_ah_maxs, 4271 float32_chs(float32_infinity)) 4272 DO_REDUCE(sve_ah_fmaxv_d, float64, H1_8, helper_vfp_ah_maxd, 4273 float64_chs(float64_infinity)) 4274 4275 #undef DO_REDUCE 4276 4277 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, 4278 float_status *status, uint32_t desc) 4279 { 4280 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4281 float16 result = nn; 4282 4283 do { 4284 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4285 do { 4286 if (pg & 1) { 4287 float16 mm = *(float16 *)(vm + H1_2(i)); 4288 result = float16_add(result, mm, status); 4289 } 4290 i += sizeof(float16), pg >>= sizeof(float16); 4291 } while (i & 15); 4292 } while (i < opr_sz); 4293 4294 return result; 4295 } 4296 4297 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, 4298 float_status *status, uint32_t desc) 4299 { 4300 
intptr_t i = 0, opr_sz = simd_oprsz(desc); 4301 float32 result = nn; 4302 4303 do { 4304 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4305 do { 4306 if (pg & 1) { 4307 float32 mm = *(float32 *)(vm + H1_2(i)); 4308 result = float32_add(result, mm, status); 4309 } 4310 i += sizeof(float32), pg >>= sizeof(float32); 4311 } while (i & 15); 4312 } while (i < opr_sz); 4313 4314 return result; 4315 } 4316 4317 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, 4318 float_status *status, uint32_t desc) 4319 { 4320 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; 4321 uint64_t *m = vm; 4322 uint8_t *pg = vg; 4323 4324 for (i = 0; i < opr_sz; i++) { 4325 if (pg[H1(i)] & 1) { 4326 nn = float64_add(nn, m[i], status); 4327 } 4328 } 4329 4330 return nn; 4331 } 4332 4333 /* Fully general three-operand expander, controlled by a predicate, 4334 * With the extra float_status parameter. 4335 */ 4336 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ 4337 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4338 float_status *status, uint32_t desc) \ 4339 { \ 4340 intptr_t i = simd_oprsz(desc); \ 4341 uint64_t *g = vg; \ 4342 do { \ 4343 uint64_t pg = g[(i - 1) >> 6]; \ 4344 do { \ 4345 i -= sizeof(TYPE); \ 4346 if (likely((pg >> (i & 63)) & 1)) { \ 4347 TYPE nn = *(TYPE *)(vn + H(i)); \ 4348 TYPE mm = *(TYPE *)(vm + H(i)); \ 4349 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4350 } \ 4351 } while (i & 63); \ 4352 } while (i != 0); \ 4353 } 4354 4355 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) 4356 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) 4357 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) 4358 4359 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) 4360 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) 4361 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) 4362 4363 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) 4364 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) 4365 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) 4366 4367 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) 4368 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) 4369 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) 4370 4371 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) 4372 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) 4373 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) 4374 4375 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) 4376 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) 4377 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) 4378 4379 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh) 4380 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins) 4381 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind) 4382 4383 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh) 4384 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs) 4385 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd) 4386 4387 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) 4388 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) 4389 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) 4390 4391 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) 4392 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) 4393 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) 4394 4395 static inline float16 abd_h(float16 a, float16 b, float_status *s) 4396 { 4397 return float16_abs(float16_sub(a, b, s)); 4398 } 4399 4400 static inline float32 abd_s(float32 a, float32 b, float_status *s) 4401 { 4402 return 
float32_abs(float32_sub(a, b, s)); 4403 } 4404 4405 static inline float64 abd_d(float64 a, float64 b, float_status *s) 4406 { 4407 return float64_abs(float64_sub(a, b, s)); 4408 } 4409 4410 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */ 4411 static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat) 4412 { 4413 float16 r = float16_sub(op1, op2, stat); 4414 return float16_is_any_nan(r) ? r : float16_abs(r); 4415 } 4416 4417 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat) 4418 { 4419 float32 r = float32_sub(op1, op2, stat); 4420 return float32_is_any_nan(r) ? r : float32_abs(r); 4421 } 4422 4423 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat) 4424 { 4425 float64 r = float64_sub(op1, op2, stat); 4426 return float64_is_any_nan(r) ? r : float64_abs(r); 4427 } 4428 4429 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) 4430 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) 4431 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) 4432 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h) 4433 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s) 4434 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d) 4435 4436 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) 4437 { 4438 int b_int = MIN(MAX(b, INT_MIN), INT_MAX); 4439 return float64_scalbn(a, b_int, s); 4440 } 4441 4442 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) 4443 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) 4444 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) 4445 4446 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) 4447 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) 4448 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) 4449 4450 #undef DO_ZPZZ_FP 4451 4452 /* Three-operand expander, with one scalar operand, controlled by 4453 * a predicate, with the extra float_status parameter. 
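 * The scalar is passed in the low bits of a uint64_t and is reused for
 * every active element.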
4454 */ 4455 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \ 4456 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ 4457 float_status *status, uint32_t desc) \ 4458 { \ 4459 intptr_t i = simd_oprsz(desc); \ 4460 uint64_t *g = vg; \ 4461 TYPE mm = scalar; \ 4462 do { \ 4463 uint64_t pg = g[(i - 1) >> 6]; \ 4464 do { \ 4465 i -= sizeof(TYPE); \ 4466 if (likely((pg >> (i & 63)) & 1)) { \ 4467 TYPE nn = *(TYPE *)(vn + H(i)); \ 4468 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4469 } \ 4470 } while (i & 63); \ 4471 } while (i != 0); \ 4472 } 4473 4474 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) 4475 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) 4476 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) 4477 4478 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) 4479 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) 4480 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) 4481 4482 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) 4483 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) 4484 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) 4485 4486 static inline float16 subr_h(float16 a, float16 b, float_status *s) 4487 { 4488 return float16_sub(b, a, s); 4489 } 4490 4491 static inline float32 subr_s(float32 a, float32 b, float_status *s) 4492 { 4493 return float32_sub(b, a, s); 4494 } 4495 4496 static inline float64 subr_d(float64 a, float64 b, float_status *s) 4497 { 4498 return float64_sub(b, a, s); 4499 } 4500 4501 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) 4502 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) 4503 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) 4504 4505 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) 4506 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) 4507 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) 4508 4509 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) 4510 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) 4511 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) 4512 4513 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) 4514 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) 4515 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) 4516 4517 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) 4518 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) 4519 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) 4520 4521 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh) 4522 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs) 4523 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd) 4524 4525 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh) 4526 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins) 4527 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind) 4528 4529 /* Fully general two-operand expander, controlled by a predicate, 4530 * With the extra float_status parameter. 4531 */ 4532 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \ 4533 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 4534 float_status *status, uint32_t desc) \ 4535 { \ 4536 intptr_t i = simd_oprsz(desc); \ 4537 uint64_t *g = vg; \ 4538 do { \ 4539 uint64_t pg = g[(i - 1) >> 6]; \ 4540 do { \ 4541 i -= sizeof(TYPE); \ 4542 if (likely((pg >> (i & 63)) & 1)) { \ 4543 TYPE nn = *(TYPE *)(vn + H(i)); \ 4544 *(TYPE *)(vd + H(i)) = OP(nn, status); \ 4545 } \ 4546 } while (i & 63); \ 4547 } while (i != 0); \ 4548 } 4549 4550 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore 4551 * FZ16. 
When converting from fp16, this affects flushing input denormals; 4552 * when converting to fp16, this affects flushing output denormals. 4553 */ 4554 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) 4555 { 4556 bool save = get_flush_inputs_to_zero(fpst); 4557 float32 ret; 4558 4559 set_flush_inputs_to_zero(false, fpst); 4560 ret = float16_to_float32(f, true, fpst); 4561 set_flush_inputs_to_zero(save, fpst); 4562 return ret; 4563 } 4564 4565 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) 4566 { 4567 bool save = get_flush_inputs_to_zero(fpst); 4568 float64 ret; 4569 4570 set_flush_inputs_to_zero(false, fpst); 4571 ret = float16_to_float64(f, true, fpst); 4572 set_flush_inputs_to_zero(save, fpst); 4573 return ret; 4574 } 4575 4576 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) 4577 { 4578 bool save = get_flush_to_zero(fpst); 4579 float16 ret; 4580 4581 set_flush_to_zero(false, fpst); 4582 ret = float32_to_float16(f, true, fpst); 4583 set_flush_to_zero(save, fpst); 4584 return ret; 4585 } 4586 4587 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) 4588 { 4589 bool save = get_flush_to_zero(fpst); 4590 float16 ret; 4591 4592 set_flush_to_zero(false, fpst); 4593 ret = float64_to_float16(f, true, fpst); 4594 set_flush_to_zero(save, fpst); 4595 return ret; 4596 } 4597 4598 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) 4599 { 4600 if (float16_is_any_nan(f)) { 4601 float_raise(float_flag_invalid, s); 4602 return 0; 4603 } 4604 return float16_to_int16_round_to_zero(f, s); 4605 } 4606 4607 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) 4608 { 4609 if (float16_is_any_nan(f)) { 4610 float_raise(float_flag_invalid, s); 4611 return 0; 4612 } 4613 return float16_to_int64_round_to_zero(f, s); 4614 } 4615 4616 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) 4617 { 4618 if (float32_is_any_nan(f)) { 4619 float_raise(float_flag_invalid, s); 4620 return 0; 4621 } 4622 return float32_to_int64_round_to_zero(f, s); 4623 } 4624 4625 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) 4626 { 4627 if (float64_is_any_nan(f)) { 4628 float_raise(float_flag_invalid, s); 4629 return 0; 4630 } 4631 return float64_to_int64_round_to_zero(f, s); 4632 } 4633 4634 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) 4635 { 4636 if (float16_is_any_nan(f)) { 4637 float_raise(float_flag_invalid, s); 4638 return 0; 4639 } 4640 return float16_to_uint16_round_to_zero(f, s); 4641 } 4642 4643 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) 4644 { 4645 if (float16_is_any_nan(f)) { 4646 float_raise(float_flag_invalid, s); 4647 return 0; 4648 } 4649 return float16_to_uint64_round_to_zero(f, s); 4650 } 4651 4652 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) 4653 { 4654 if (float32_is_any_nan(f)) { 4655 float_raise(float_flag_invalid, s); 4656 return 0; 4657 } 4658 return float32_to_uint64_round_to_zero(f, s); 4659 } 4660 4661 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) 4662 { 4663 if (float64_is_any_nan(f)) { 4664 float_raise(float_flag_invalid, s); 4665 return 0; 4666 } 4667 return float64_to_uint64_round_to_zero(f, s); 4668 } 4669 4670 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) 4671 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) 4672 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) 4673 DO_ZPZ_FP(sve_fcvt_dh, 
uint64_t, H1_8, sve_f64_to_f16) 4674 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) 4675 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32) 4676 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) 4677 4678 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) 4679 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) 4680 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) 4681 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) 4682 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 4683 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) 4684 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) 4685 4686 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) 4687 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) 4688 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) 4689 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) 4690 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) 4691 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) 4692 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) 4693 4694 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) 4695 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) 4696 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) 4697 4698 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) 4699 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) 4700 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) 4701 4702 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) 4703 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) 4704 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) 4705 4706 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) 4707 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) 4708 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) 4709 4710 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) 4711 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) 4712 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) 4713 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) 4714 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) 4715 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) 4716 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) 4717 4718 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) 4719 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) 4720 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) 4721 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) 4722 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) 4723 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) 4724 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) 4725 4726 static int16_t do_float16_logb_as_int(float16 a, float_status *s) 4727 { 4728 /* Extract frac to the top of the uint32_t. 
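     * The 16 + 6 shift moves the 10 fraction bits to the top of the word:
     * 16 to reach the upper half plus 6 to discard the sign and exponent.
     * E.g. FLOGB(8.0) returns 3 (biased exponent 18 minus the bias of 15).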
*/ 4729 uint32_t frac = (uint32_t)a << (16 + 6); 4730 int16_t exp = extract32(a, 10, 5); 4731 4732 if (unlikely(exp == 0)) { 4733 if (frac != 0) { 4734 if (!get_flush_inputs_to_zero(s)) { 4735 /* denormal: bias - fractional_zeros */ 4736 return -15 - clz32(frac); 4737 } 4738 /* flush to zero */ 4739 float_raise(float_flag_input_denormal_flushed, s); 4740 } 4741 } else if (unlikely(exp == 0x1f)) { 4742 if (frac == 0) { 4743 return INT16_MAX; /* infinity */ 4744 } 4745 } else { 4746 /* normal: exp - bias */ 4747 return exp - 15; 4748 } 4749 /* nan or zero */ 4750 float_raise(float_flag_invalid, s); 4751 return INT16_MIN; 4752 } 4753 4754 static int32_t do_float32_logb_as_int(float32 a, float_status *s) 4755 { 4756 /* Extract frac to the top of the uint32_t. */ 4757 uint32_t frac = a << 9; 4758 int32_t exp = extract32(a, 23, 8); 4759 4760 if (unlikely(exp == 0)) { 4761 if (frac != 0) { 4762 if (!get_flush_inputs_to_zero(s)) { 4763 /* denormal: bias - fractional_zeros */ 4764 return -127 - clz32(frac); 4765 } 4766 /* flush to zero */ 4767 float_raise(float_flag_input_denormal_flushed, s); 4768 } 4769 } else if (unlikely(exp == 0xff)) { 4770 if (frac == 0) { 4771 return INT32_MAX; /* infinity */ 4772 } 4773 } else { 4774 /* normal: exp - bias */ 4775 return exp - 127; 4776 } 4777 /* nan or zero */ 4778 float_raise(float_flag_invalid, s); 4779 return INT32_MIN; 4780 } 4781 4782 static int64_t do_float64_logb_as_int(float64 a, float_status *s) 4783 { 4784 /* Extract frac to the top of the uint64_t. */ 4785 uint64_t frac = a << 12; 4786 int64_t exp = extract64(a, 52, 11); 4787 4788 if (unlikely(exp == 0)) { 4789 if (frac != 0) { 4790 if (!get_flush_inputs_to_zero(s)) { 4791 /* denormal: bias - fractional_zeros */ 4792 return -1023 - clz64(frac); 4793 } 4794 /* flush to zero */ 4795 float_raise(float_flag_input_denormal_flushed, s); 4796 } 4797 } else if (unlikely(exp == 0x7ff)) { 4798 if (frac == 0) { 4799 return INT64_MAX; /* infinity */ 4800 } 4801 } else { 4802 /* normal: exp - bias */ 4803 return exp - 1023; 4804 } 4805 /* nan or zero */ 4806 float_raise(float_flag_invalid, s); 4807 return INT64_MIN; 4808 } 4809 4810 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) 4811 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) 4812 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) 4813 4814 #undef DO_ZPZ_FP 4815 4816 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, 4817 float_status *status, uint32_t desc, 4818 uint16_t neg1, uint16_t neg3, int flags) 4819 { 4820 intptr_t i = simd_oprsz(desc); 4821 uint64_t *g = vg; 4822 4823 do { 4824 uint64_t pg = g[(i - 1) >> 6]; 4825 do { 4826 i -= 2; 4827 if (likely((pg >> (i & 63)) & 1)) { 4828 float16 e1, e2, e3, r; 4829 4830 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; 4831 e2 = *(uint16_t *)(vm + H1_2(i)); 4832 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; 4833 r = float16_muladd(e1, e2, e3, flags, status); 4834 *(uint16_t *)(vd + H1_2(i)) = r; 4835 } 4836 } while (i & 63); 4837 } while (i != 0); 4838 } 4839 4840 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4841 void *vg, float_status *status, uint32_t desc) 4842 { 4843 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 4844 } 4845 4846 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4847 void *vg, float_status *status, uint32_t desc) 4848 { 4849 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0); 4850 } 4851 4852 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4853 void 
*vg, float_status *status, uint32_t desc) 4854 { 4855 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0); 4856 } 4857 4858 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4859 void *vg, float_status *status, uint32_t desc) 4860 { 4861 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0); 4862 } 4863 4864 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4865 void *vg, float_status *status, uint32_t desc) 4866 { 4867 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 4868 float_muladd_negate_product); 4869 } 4870 4871 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4872 void *vg, float_status *status, uint32_t desc) 4873 { 4874 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 4875 float_muladd_negate_product | float_muladd_negate_c); 4876 } 4877 4878 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4879 void *vg, float_status *status, uint32_t desc) 4880 { 4881 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 4882 float_muladd_negate_c); 4883 } 4884 4885 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, 4886 float_status *status, uint32_t desc, 4887 uint32_t neg1, uint32_t neg3, int flags) 4888 { 4889 intptr_t i = simd_oprsz(desc); 4890 uint64_t *g = vg; 4891 4892 do { 4893 uint64_t pg = g[(i - 1) >> 6]; 4894 do { 4895 i -= 4; 4896 if (likely((pg >> (i & 63)) & 1)) { 4897 float32 e1, e2, e3, r; 4898 4899 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; 4900 e2 = *(uint32_t *)(vm + H1_4(i)); 4901 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; 4902 r = float32_muladd(e1, e2, e3, flags, status); 4903 *(uint32_t *)(vd + H1_4(i)) = r; 4904 } 4905 } while (i & 63); 4906 } while (i != 0); 4907 } 4908 4909 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4910 void *vg, float_status *status, uint32_t desc) 4911 { 4912 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 4913 } 4914 4915 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4916 void *vg, float_status *status, uint32_t desc) 4917 { 4918 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0); 4919 } 4920 4921 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4922 void *vg, float_status *status, uint32_t desc) 4923 { 4924 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0); 4925 } 4926 4927 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4928 void *vg, float_status *status, uint32_t desc) 4929 { 4930 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0); 4931 } 4932 4933 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4934 void *vg, float_status *status, uint32_t desc) 4935 { 4936 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 4937 float_muladd_negate_product); 4938 } 4939 4940 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4941 void *vg, float_status *status, uint32_t desc) 4942 { 4943 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 4944 float_muladd_negate_product | float_muladd_negate_c); 4945 } 4946 4947 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4948 void *vg, float_status *status, uint32_t desc) 4949 { 4950 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 4951 float_muladd_negate_c); 4952 } 4953 4954 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, 4955 float_status *status, uint32_t desc, 4956 uint64_t 
neg1, uint64_t neg3, int flags) 4957 { 4958 intptr_t i = simd_oprsz(desc); 4959 uint64_t *g = vg; 4960 4961 do { 4962 uint64_t pg = g[(i - 1) >> 6]; 4963 do { 4964 i -= 8; 4965 if (likely((pg >> (i & 63)) & 1)) { 4966 float64 e1, e2, e3, r; 4967 4968 e1 = *(uint64_t *)(vn + i) ^ neg1; 4969 e2 = *(uint64_t *)(vm + i); 4970 e3 = *(uint64_t *)(va + i) ^ neg3; 4971 r = float64_muladd(e1, e2, e3, flags, status); 4972 *(uint64_t *)(vd + i) = r; 4973 } 4974 } while (i & 63); 4975 } while (i != 0); 4976 } 4977 4978 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4979 void *vg, float_status *status, uint32_t desc) 4980 { 4981 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 4982 } 4983 4984 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4985 void *vg, float_status *status, uint32_t desc) 4986 { 4987 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0); 4988 } 4989 4990 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4991 void *vg, float_status *status, uint32_t desc) 4992 { 4993 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0); 4994 } 4995 4996 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4997 void *vg, float_status *status, uint32_t desc) 4998 { 4999 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0); 5000 } 5001 5002 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5003 void *vg, float_status *status, uint32_t desc) 5004 { 5005 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5006 float_muladd_negate_product); 5007 } 5008 5009 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5010 void *vg, float_status *status, uint32_t desc) 5011 { 5012 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5013 float_muladd_negate_product | float_muladd_negate_c); 5014 } 5015 5016 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5017 void *vg, float_status *status, uint32_t desc) 5018 { 5019 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5020 float_muladd_negate_c); 5021 } 5022 5023 /* Two operand floating-point comparison controlled by a predicate. 5024 * Unlike the integer version, we are not allowed to optimistically 5025 * compare operands, since the comparison may have side effects wrt 5026 * the FPSR. 
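 *
 * The ordered comparisons (GE, GT, LE, LT and the absolute-value forms)
 * use the signalling compare, so a NaN operand raises Invalid Operation;
 * EQ, NE and UO use the quiet compare.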
5027 */ 5028 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ 5029 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 5030 float_status *status, uint32_t desc) \ 5031 { \ 5032 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 5033 uint64_t *d = vd, *g = vg; \ 5034 do { \ 5035 uint64_t out = 0, pg = g[j]; \ 5036 do { \ 5037 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 5038 if (likely((pg >> (i & 63)) & 1)) { \ 5039 TYPE nn = *(TYPE *)(vn + H(i)); \ 5040 TYPE mm = *(TYPE *)(vm + H(i)); \ 5041 out |= OP(TYPE, nn, mm, status); \ 5042 } \ 5043 } while (i & 63); \ 5044 d[j--] = out; \ 5045 } while (i > 0); \ 5046 } 5047 5048 #define DO_FPCMP_PPZZ_H(NAME, OP) \ 5049 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) 5050 #define DO_FPCMP_PPZZ_S(NAME, OP) \ 5051 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) 5052 #define DO_FPCMP_PPZZ_D(NAME, OP) \ 5053 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) 5054 5055 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \ 5056 DO_FPCMP_PPZZ_H(NAME, OP) \ 5057 DO_FPCMP_PPZZ_S(NAME, OP) \ 5058 DO_FPCMP_PPZZ_D(NAME, OP) 5059 5060 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 5061 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 5062 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 5063 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 5064 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 5065 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 5066 #define DO_FCMUO(TYPE, X, Y, ST) \ 5067 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered 5068 #define DO_FACGE(TYPE, X, Y, ST) \ 5069 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 5070 #define DO_FACGT(TYPE, X, Y, ST) \ 5071 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 5072 5073 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) 5074 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) 5075 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) 5076 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) 5077 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) 5078 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) 5079 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) 5080 5081 #undef DO_FPCMP_PPZZ_ALL 5082 #undef DO_FPCMP_PPZZ_D 5083 #undef DO_FPCMP_PPZZ_S 5084 #undef DO_FPCMP_PPZZ_H 5085 #undef DO_FPCMP_PPZZ 5086 5087 /* One operand floating-point comparison against zero, controlled 5088 * by a predicate. 
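 * The zero operand is the integer literal 0, which is also the bit
 * pattern of +0.0 in each of the float formats used here.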
5089 */ 5090 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ 5091 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 5092 float_status *status, uint32_t desc) \ 5093 { \ 5094 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 5095 uint64_t *d = vd, *g = vg; \ 5096 do { \ 5097 uint64_t out = 0, pg = g[j]; \ 5098 do { \ 5099 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 5100 if ((pg >> (i & 63)) & 1) { \ 5101 TYPE nn = *(TYPE *)(vn + H(i)); \ 5102 out |= OP(TYPE, nn, 0, status); \ 5103 } \ 5104 } while (i & 63); \ 5105 d[j--] = out; \ 5106 } while (i > 0); \ 5107 } 5108 5109 #define DO_FPCMP_PPZ0_H(NAME, OP) \ 5110 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) 5111 #define DO_FPCMP_PPZ0_S(NAME, OP) \ 5112 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) 5113 #define DO_FPCMP_PPZ0_D(NAME, OP) \ 5114 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) 5115 5116 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \ 5117 DO_FPCMP_PPZ0_H(NAME, OP) \ 5118 DO_FPCMP_PPZ0_S(NAME, OP) \ 5119 DO_FPCMP_PPZ0_D(NAME, OP) 5120 5121 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) 5122 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) 5123 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) 5124 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) 5125 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) 5126 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) 5127 5128 /* FP Trig Multiply-Add. */ 5129 5130 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, 5131 float_status *s, uint32_t desc) 5132 { 5133 static const float16 coeff[16] = { 5134 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 5135 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 5136 }; 5137 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); 5138 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5139 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5140 float16 *d = vd, *n = vn, *m = vm; 5141 5142 for (i = 0; i < opr_sz; i++) { 5143 float16 mm = m[i]; 5144 intptr_t xx = x; 5145 int flags = 0; 5146 5147 if (float16_is_neg(mm)) { 5148 if (fpcr_ah) { 5149 flags = float_muladd_negate_product; 5150 } else { 5151 mm = float16_abs(mm); 5152 } 5153 xx += 8; 5154 } 5155 d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s); 5156 } 5157 } 5158 5159 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, 5160 float_status *s, uint32_t desc) 5161 { 5162 static const float32 coeff[16] = { 5163 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, 5164 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, 5165 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, 5166 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, 5167 }; 5168 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); 5169 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5170 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5171 float32 *d = vd, *n = vn, *m = vm; 5172 5173 for (i = 0; i < opr_sz; i++) { 5174 float32 mm = m[i]; 5175 intptr_t xx = x; 5176 int flags = 0; 5177 5178 if (float32_is_neg(mm)) { 5179 if (fpcr_ah) { 5180 flags = float_muladd_negate_product; 5181 } else { 5182 mm = float32_abs(mm); 5183 } 5184 xx += 8; 5185 } 5186 d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s); 5187 } 5188 } 5189 5190 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, 5191 float_status *s, uint32_t desc) 5192 { 5193 static const float64 coeff[16] = { 5194 0x3ff0000000000000ull, 0xbfc5555555555543ull, 5195 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, 5196 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, 5197 0x3de5d8408868552full, 0x0000000000000000ull, 5198 0x3ff0000000000000ull, 0xbfe0000000000000ull, 5199 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, 5200 
0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, 5201 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, 5202 }; 5203 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); 5204 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5205 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5206 float64 *d = vd, *n = vn, *m = vm; 5207 5208 for (i = 0; i < opr_sz; i++) { 5209 float64 mm = m[i]; 5210 intptr_t xx = x; 5211 int flags = 0; 5212 5213 if (float64_is_neg(mm)) { 5214 if (fpcr_ah) { 5215 flags = float_muladd_negate_product; 5216 } else { 5217 mm = float64_abs(mm); 5218 } 5219 xx += 8; 5220 } 5221 d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s); 5222 } 5223 } 5224 5225 /* 5226 * FP Complex Add 5227 */ 5228 5229 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, 5230 float_status *s, uint32_t desc) 5231 { 5232 intptr_t j, i = simd_oprsz(desc); 5233 uint64_t *g = vg; 5234 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5235 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5236 5237 do { 5238 uint64_t pg = g[(i - 1) >> 6]; 5239 do { 5240 float16 e0, e1, e2, e3; 5241 5242 /* I holds the real index; J holds the imag index. */ 5243 j = i - sizeof(float16); 5244 i -= 2 * sizeof(float16); 5245 5246 e0 = *(float16 *)(vn + H1_2(i)); 5247 e1 = *(float16 *)(vm + H1_2(j)); 5248 e2 = *(float16 *)(vn + H1_2(j)); 5249 e3 = *(float16 *)(vm + H1_2(i)); 5250 5251 if (rot) { 5252 e3 = float16_maybe_ah_chs(e3, fpcr_ah); 5253 } else { 5254 e1 = float16_maybe_ah_chs(e1, fpcr_ah); 5255 } 5256 5257 if (likely((pg >> (i & 63)) & 1)) { 5258 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s); 5259 } 5260 if (likely((pg >> (j & 63)) & 1)) { 5261 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s); 5262 } 5263 } while (i & 63); 5264 } while (i != 0); 5265 } 5266 5267 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, 5268 float_status *s, uint32_t desc) 5269 { 5270 intptr_t j, i = simd_oprsz(desc); 5271 uint64_t *g = vg; 5272 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5273 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5274 5275 do { 5276 uint64_t pg = g[(i - 1) >> 6]; 5277 do { 5278 float32 e0, e1, e2, e3; 5279 5280 /* I holds the real index; J holds the imag index. */ 5281 j = i - sizeof(float32); 5282 i -= 2 * sizeof(float32); 5283 5284 e0 = *(float32 *)(vn + H1_2(i)); 5285 e1 = *(float32 *)(vm + H1_2(j)); 5286 e2 = *(float32 *)(vn + H1_2(j)); 5287 e3 = *(float32 *)(vm + H1_2(i)); 5288 5289 if (rot) { 5290 e3 = float32_maybe_ah_chs(e3, fpcr_ah); 5291 } else { 5292 e1 = float32_maybe_ah_chs(e1, fpcr_ah); 5293 } 5294 5295 if (likely((pg >> (i & 63)) & 1)) { 5296 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s); 5297 } 5298 if (likely((pg >> (j & 63)) & 1)) { 5299 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s); 5300 } 5301 } while (i & 63); 5302 } while (i != 0); 5303 } 5304 5305 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, 5306 float_status *s, uint32_t desc) 5307 { 5308 intptr_t j, i = simd_oprsz(desc); 5309 uint64_t *g = vg; 5310 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5311 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5312 5313 do { 5314 uint64_t pg = g[(i - 1) >> 6]; 5315 do { 5316 float64 e0, e1, e2, e3; 5317 5318 /* I holds the real index; J holds the imag index. 
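 *
 * In terms of the architectural rotation (a sketch; rot is the single
 * descriptor bit extracted above):
 *
 *     rot == 0 (#90):   d.re = n.re - m.im;   d.im = n.im + m.re
 *     rot == 1 (#270):  d.re = n.re + m.im;   d.im = n.im - m.re
 *
 * where the negation of the m element goes through float64_maybe_ah_chs()
 * so that, with FPCR.AH == 1, the sign of a NaN operand is not flipped.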
*/ 5319 j = i - sizeof(float64); 5320 i -= 2 * sizeof(float64); 5321 5322 e0 = *(float64 *)(vn + H1_2(i)); 5323 e1 = *(float64 *)(vm + H1_2(j)); 5324 e2 = *(float64 *)(vn + H1_2(j)); 5325 e3 = *(float64 *)(vm + H1_2(i)); 5326 5327 if (rot) { 5328 e3 = float64_maybe_ah_chs(e3, fpcr_ah); 5329 } else { 5330 e1 = float64_maybe_ah_chs(e1, fpcr_ah); 5331 } 5332 5333 if (likely((pg >> (i & 63)) & 1)) { 5334 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s); 5335 } 5336 if (likely((pg >> (j & 63)) & 1)) { 5337 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s); 5338 } 5339 } while (i & 63); 5340 } while (i != 0); 5341 } 5342 5343 /* 5344 * FP Complex Multiply 5345 */ 5346 5347 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5348 void *vg, float_status *status, uint32_t desc) 5349 { 5350 intptr_t j, i = simd_oprsz(desc); 5351 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5352 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5353 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5354 uint32_t negf_real = flip ^ negf_imag; 5355 float16 negx_imag, negx_real; 5356 uint64_t *g = vg; 5357 5358 /* With AH=0, use negx; with AH=1 use negf. */ 5359 negx_real = (negf_real & ~fpcr_ah) << 15; 5360 negx_imag = (negf_imag & ~fpcr_ah) << 15; 5361 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5362 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5363 5364 do { 5365 uint64_t pg = g[(i - 1) >> 6]; 5366 do { 5367 float16 e1, e2, e3, e4, nr, ni, mr, mi, d; 5368 5369 /* I holds the real index; J holds the imag index. */ 5370 j = i - sizeof(float16); 5371 i -= 2 * sizeof(float16); 5372 5373 nr = *(float16 *)(vn + H1_2(i)); 5374 ni = *(float16 *)(vn + H1_2(j)); 5375 mr = *(float16 *)(vm + H1_2(i)); 5376 mi = *(float16 *)(vm + H1_2(j)); 5377 5378 e2 = (flip ? ni : nr); 5379 e1 = (flip ? mi : mr) ^ negx_real; 5380 e4 = e2; 5381 e3 = (flip ? mr : mi) ^ negx_imag; 5382 5383 if (likely((pg >> (i & 63)) & 1)) { 5384 d = *(float16 *)(va + H1_2(i)); 5385 d = float16_muladd(e2, e1, d, negf_real, status); 5386 *(float16 *)(vd + H1_2(i)) = d; 5387 } 5388 if (likely((pg >> (j & 63)) & 1)) { 5389 d = *(float16 *)(va + H1_2(j)); 5390 d = float16_muladd(e4, e3, d, negf_imag, status); 5391 *(float16 *)(vd + H1_2(j)) = d; 5392 } 5393 } while (i & 63); 5394 } while (i != 0); 5395 } 5396 5397 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5398 void *vg, float_status *status, uint32_t desc) 5399 { 5400 intptr_t j, i = simd_oprsz(desc); 5401 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5402 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5403 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5404 uint32_t negf_real = flip ^ negf_imag; 5405 float32 negx_imag, negx_real; 5406 uint64_t *g = vg; 5407 5408 /* With AH=0, use negx; with AH=1 use negf. */ 5409 negx_real = (negf_real & ~fpcr_ah) << 31; 5410 negx_imag = (negf_imag & ~fpcr_ah) << 31; 5411 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5412 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5413 5414 do { 5415 uint64_t pg = g[(i - 1) >> 6]; 5416 do { 5417 float32 e1, e2, e3, e4, nr, ni, mr, mi, d; 5418 5419 /* I holds the real index; J holds the imag index. */ 5420 j = i - sizeof(float32); 5421 i -= 2 * sizeof(float32); 5422 5423 nr = *(float32 *)(vn + H1_2(i)); 5424 ni = *(float32 *)(vn + H1_2(j)); 5425 mr = *(float32 *)(vm + H1_2(i)); 5426 mi = *(float32 *)(vm + H1_2(j)); 5427 5428 e2 = (flip ? 
ni : nr); 5429 e1 = (flip ? mi : mr) ^ negx_real; 5430 e4 = e2; 5431 e3 = (flip ? mr : mi) ^ negx_imag; 5432 5433 if (likely((pg >> (i & 63)) & 1)) { 5434 d = *(float32 *)(va + H1_2(i)); 5435 d = float32_muladd(e2, e1, d, negf_real, status); 5436 *(float32 *)(vd + H1_2(i)) = d; 5437 } 5438 if (likely((pg >> (j & 63)) & 1)) { 5439 d = *(float32 *)(va + H1_2(j)); 5440 d = float32_muladd(e4, e3, d, negf_imag, status); 5441 *(float32 *)(vd + H1_2(j)) = d; 5442 } 5443 } while (i & 63); 5444 } while (i != 0); 5445 } 5446 5447 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5448 void *vg, float_status *status, uint32_t desc) 5449 { 5450 intptr_t j, i = simd_oprsz(desc); 5451 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5452 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5453 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5454 uint32_t negf_real = flip ^ negf_imag; 5455 float64 negx_imag, negx_real; 5456 uint64_t *g = vg; 5457 5458 /* With AH=0, use negx; with AH=1 use negf. */ 5459 negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63; 5460 negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63; 5461 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5462 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5463 5464 do { 5465 uint64_t pg = g[(i - 1) >> 6]; 5466 do { 5467 float64 e1, e2, e3, e4, nr, ni, mr, mi, d; 5468 5469 /* I holds the real index; J holds the imag index. */ 5470 j = i - sizeof(float64); 5471 i -= 2 * sizeof(float64); 5472 5473 nr = *(float64 *)(vn + H1_2(i)); 5474 ni = *(float64 *)(vn + H1_2(j)); 5475 mr = *(float64 *)(vm + H1_2(i)); 5476 mi = *(float64 *)(vm + H1_2(j)); 5477 5478 e2 = (flip ? ni : nr); 5479 e1 = (flip ? mi : mr) ^ negx_real; 5480 e4 = e2; 5481 e3 = (flip ? mr : mi) ^ negx_imag; 5482 5483 if (likely((pg >> (i & 63)) & 1)) { 5484 d = *(float64 *)(va + H1_2(i)); 5485 d = float64_muladd(e2, e1, d, negf_real, status); 5486 *(float64 *)(vd + H1_2(i)) = d; 5487 } 5488 if (likely((pg >> (j & 63)) & 1)) { 5489 d = *(float64 *)(va + H1_2(j)); 5490 d = float64_muladd(e4, e3, d, negf_imag, status); 5491 *(float64 *)(vd + H1_2(j)) = d; 5492 } 5493 } while (i & 63); 5494 } while (i != 0); 5495 } 5496 5497 /* 5498 * Load contiguous data, protected by a governing predicate. 5499 */ 5500 5501 /* 5502 * Skip through a sequence of inactive elements in the guarding predicate @vg, 5503 * beginning at @reg_off bounded by @reg_max. Return the offset of the active 5504 * element >= @reg_off, or @reg_max if there were no active elements at all. 5505 */ 5506 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, 5507 intptr_t reg_max, int esz) 5508 { 5509 uint64_t pg_mask = pred_esz_masks[esz]; 5510 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); 5511 5512 /* In normal usage, the first element is active. */ 5513 if (likely(pg & 1)) { 5514 return reg_off; 5515 } 5516 5517 if (pg == 0) { 5518 reg_off &= -64; 5519 do { 5520 reg_off += 64; 5521 if (unlikely(reg_off >= reg_max)) { 5522 /* The entire predicate was false. */ 5523 return reg_max; 5524 } 5525 pg = vg[reg_off >> 6] & pg_mask; 5526 } while (pg == 0); 5527 } 5528 reg_off += ctz64(pg); 5529 5530 /* We should never see an out of range predicate bit set. */ 5531 tcg_debug_assert(reg_off < reg_max); 5532 return reg_off; 5533 } 5534 5535 /* 5536 * Resolve the guest virtual address to info->host and info->flags. 5537 * If @nofault, return false if the page is invalid, otherwise 5538 * exit via page fault exception. 
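 *
 * A minimal usage sketch, mirroring how the gather/scatter code later in
 * this file consumes the result:
 *
 *     SVEHostPage info;
 *     if (!sve_probe_page(&info, nofault, env, addr, 0,
 *                         MMU_DATA_LOAD, mmu_idx, retaddr)) {
 *         // only possible with nofault == true: no valid translation
 *     } else if (info.flags & TLB_MMIO) {
 *         // not backed by host RAM: use the tlb (slow) accessor
 *     } else {
 *         // info.host is valid: access host memory directly
 *     }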
5539 */ 5540 5541 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, 5542 target_ulong addr, int mem_off, MMUAccessType access_type, 5543 int mmu_idx, uintptr_t retaddr) 5544 { 5545 int flags; 5546 5547 addr += mem_off; 5548 5549 /* 5550 * User-only currently always issues with TBI. See the comment 5551 * above useronly_clean_ptr. Usually we clean this top byte away 5552 * during translation, but we can't do that for e.g. vector + imm 5553 * addressing modes. 5554 * 5555 * We currently always enable TBI for user-only, and do not provide 5556 * a way to turn it off. So clean the pointer unconditionally here, 5557 * rather than look it up here, or pass it down from above. 5558 */ 5559 addr = useronly_clean_ptr(addr); 5560 5561 #ifdef CONFIG_USER_ONLY 5562 flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault, 5563 &info->host, retaddr); 5564 #else 5565 CPUTLBEntryFull *full; 5566 flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault, 5567 &info->host, &full, retaddr); 5568 #endif 5569 info->flags = flags; 5570 5571 if (flags & TLB_INVALID_MASK) { 5572 g_assert(nofault); 5573 return false; 5574 } 5575 5576 #ifdef CONFIG_USER_ONLY 5577 memset(&info->attrs, 0, sizeof(info->attrs)); 5578 /* Require both ANON and MTE; see allocation_tag_mem(). */ 5579 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); 5580 #else 5581 info->attrs = full->attrs; 5582 info->tagged = full->extra.arm.pte_attrs == 0xf0; 5583 #endif 5584 5585 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ 5586 info->host -= mem_off; 5587 return true; 5588 } 5589 5590 /* 5591 * Find first active element on each page, and a loose bound for the 5592 * final element on each page. Identify any single element that spans 5593 * the page boundary. Return true if there are any active elements. 5594 */ 5595 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, 5596 intptr_t reg_max, int esz, int msize) 5597 { 5598 const int esize = 1 << esz; 5599 const uint64_t pg_mask = pred_esz_masks[esz]; 5600 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 5601 intptr_t mem_off_last, mem_off_split; 5602 intptr_t page_split, elt_split; 5603 intptr_t i; 5604 5605 /* Set all of the element indices to -1, and the TLB data to 0. */ 5606 memset(info, -1, offsetof(SVEContLdSt, page)); 5607 memset(info->page, 0, sizeof(info->page)); 5608 5609 /* Gross scan over the entire predicate to find bounds. */ 5610 i = 0; 5611 do { 5612 uint64_t pg = vg[i] & pg_mask; 5613 if (pg) { 5614 reg_off_last = i * 64 + 63 - clz64(pg); 5615 if (reg_off_first < 0) { 5616 reg_off_first = i * 64 + ctz64(pg); 5617 } 5618 } 5619 } while (++i * 64 < reg_max); 5620 5621 if (unlikely(reg_off_first < 0)) { 5622 /* No active elements, no pages touched. */ 5623 return false; 5624 } 5625 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); 5626 5627 info->reg_off_first[0] = reg_off_first; 5628 info->mem_off_first[0] = (reg_off_first >> esz) * msize; 5629 mem_off_last = (reg_off_last >> esz) * msize; 5630 5631 page_split = -(addr | TARGET_PAGE_MASK); 5632 if (likely(mem_off_last + msize <= page_split)) { 5633 /* The entire operation fits within a single page. 
*/ 5634 info->reg_off_last[0] = reg_off_last; 5635 return true; 5636 } 5637 5638 info->page_split = page_split; 5639 elt_split = page_split / msize; 5640 reg_off_split = elt_split << esz; 5641 mem_off_split = elt_split * msize; 5642 5643 /* 5644 * This is the last full element on the first page, but it is not 5645 * necessarily active. If there is no full element, i.e. the first 5646 * active element is the one that's split, this value remains -1. 5647 * It is useful as iteration bounds. 5648 */ 5649 if (elt_split != 0) { 5650 info->reg_off_last[0] = reg_off_split - esize; 5651 } 5652 5653 /* Determine if an unaligned element spans the pages. */ 5654 if (page_split % msize != 0) { 5655 /* It is helpful to know if the split element is active. */ 5656 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { 5657 info->reg_off_split = reg_off_split; 5658 info->mem_off_split = mem_off_split; 5659 5660 if (reg_off_split == reg_off_last) { 5661 /* The page crossing element is last. */ 5662 return true; 5663 } 5664 } 5665 reg_off_split += esize; 5666 mem_off_split += msize; 5667 } 5668 5669 /* 5670 * We do want the first active element on the second page, because 5671 * this may affect the address reported in an exception. 5672 */ 5673 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); 5674 tcg_debug_assert(reg_off_split <= reg_off_last); 5675 info->reg_off_first[1] = reg_off_split; 5676 info->mem_off_first[1] = (reg_off_split >> esz) * msize; 5677 info->reg_off_last[1] = reg_off_last; 5678 return true; 5679 } 5680 5681 /* 5682 * Resolve the guest virtual addresses to info->page[]. 5683 * Control the generation of page faults with @fault. Return false if 5684 * there is no work to do, which can only happen with @fault == FAULT_NO. 5685 */ 5686 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, 5687 CPUARMState *env, target_ulong addr, 5688 MMUAccessType access_type, uintptr_t retaddr) 5689 { 5690 int mmu_idx = arm_env_mmu_index(env); 5691 int mem_off = info->mem_off_first[0]; 5692 bool nofault = fault == FAULT_NO; 5693 bool have_work = true; 5694 5695 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, 5696 access_type, mmu_idx, retaddr)) { 5697 /* No work to be done. */ 5698 return false; 5699 } 5700 5701 if (likely(info->page_split < 0)) { 5702 /* The entire operation was on the one page. */ 5703 return true; 5704 } 5705 5706 /* 5707 * If the second page is invalid, then we want the fault address to be 5708 * the first byte on that page which is accessed. 5709 */ 5710 if (info->mem_off_split >= 0) { 5711 /* 5712 * There is an element split across the pages. The fault address 5713 * should be the first byte of the second page. 5714 */ 5715 mem_off = info->page_split; 5716 /* 5717 * If the split element is also the first active element 5718 * of the vector, then: For first-fault we should continue 5719 * to generate faults for the second page. For no-fault, 5720 * we have work only if the second page is valid. 5721 */ 5722 if (info->mem_off_first[0] < info->mem_off_split) { 5723 nofault = FAULT_FIRST; 5724 have_work = false; 5725 } 5726 } else { 5727 /* 5728 * There is no element split across the pages. The fault address 5729 * should be the first active element on the second page. 5730 */ 5731 mem_off = info->mem_off_first[1]; 5732 /* 5733 * There must have been one active element on the first page, 5734 * so we're out of first-fault territory. 
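 * (That is: FAULT_ALL must still take the fault on an invalid second page,
 * while FAULT_FIRST and FAULT_NO probe it with nofault set and leave the
 * failure for the caller to handle.)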
*/ 5736 nofault = fault != FAULT_ALL; 5737 } 5738 5739 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off, 5740 access_type, mmu_idx, retaddr); 5741 return have_work; 5742 } 5743 5744 #ifndef CONFIG_USER_ONLY 5745 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, 5746 uint64_t *vg, target_ulong addr, 5747 int esize, int msize, int wp_access, 5748 uintptr_t retaddr) 5749 { 5750 intptr_t mem_off, reg_off, reg_last; 5751 int flags0 = info->page[0].flags; 5752 int flags1 = info->page[1].flags; 5753 5754 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { 5755 return; 5756 } 5757 5758 /* Indicate that watchpoints are handled. */ 5759 info->page[0].flags = flags0 & ~TLB_WATCHPOINT; 5760 info->page[1].flags = flags1 & ~TLB_WATCHPOINT; 5761 5762 if (flags0 & TLB_WATCHPOINT) { 5763 mem_off = info->mem_off_first[0]; 5764 reg_off = info->reg_off_first[0]; 5765 reg_last = info->reg_off_last[0]; 5766 5767 while (reg_off <= reg_last) { 5768 uint64_t pg = vg[reg_off >> 6]; 5769 do { 5770 if ((pg >> (reg_off & 63)) & 1) { 5771 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 5772 msize, info->page[0].attrs, 5773 wp_access, retaddr); 5774 } 5775 reg_off += esize; 5776 mem_off += msize; 5777 } while (reg_off <= reg_last && (reg_off & 63)); 5778 } 5779 } 5780 5781 mem_off = info->mem_off_split; 5782 if (mem_off >= 0) { 5783 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize, 5784 info->page[0].attrs, wp_access, retaddr); 5785 } 5786 5787 mem_off = info->mem_off_first[1]; 5788 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) { 5789 reg_off = info->reg_off_first[1]; 5790 reg_last = info->reg_off_last[1]; 5791 5792 do { 5793 uint64_t pg = vg[reg_off >> 6]; 5794 do { 5795 if ((pg >> (reg_off & 63)) & 1) { 5796 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 5797 msize, info->page[1].attrs, 5798 wp_access, retaddr); 5799 } 5800 reg_off += esize; 5801 mem_off += msize; 5802 } while (reg_off & 63); 5803 } while (reg_off <= reg_last); 5804 } 5805 } 5806 #endif 5807 5808 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, 5809 uint64_t *vg, target_ulong addr, int esize, 5810 int msize, uint32_t mtedesc, uintptr_t ra) 5811 { 5812 intptr_t mem_off, reg_off, reg_last; 5813 5814 /* Process the page only if MemAttr == Tagged. */ 5815 if (info->page[0].tagged) { 5816 mem_off = info->mem_off_first[0]; 5817 reg_off = info->reg_off_first[0]; 5818 reg_last = info->reg_off_split; 5819 if (reg_last < 0) { 5820 reg_last = info->reg_off_last[0]; 5821 } 5822 5823 do { 5824 uint64_t pg = vg[reg_off >> 6]; 5825 do { 5826 if ((pg >> (reg_off & 63)) & 1) { 5827 mte_check(env, mtedesc, addr + mem_off, ra); 5828 } 5829 reg_off += esize; 5830 mem_off += msize; 5831 } while (reg_off <= reg_last && (reg_off & 63)); 5832 } while (reg_off <= reg_last); 5833 } 5834 5835 mem_off = info->mem_off_first[1]; 5836 if (mem_off >= 0 && info->page[1].tagged) { 5837 reg_off = info->reg_off_first[1]; 5838 reg_last = info->reg_off_last[1]; 5839 5840 do { 5841 uint64_t pg = vg[reg_off >> 6]; 5842 do { 5843 if ((pg >> (reg_off & 63)) & 1) { 5844 mte_check(env, mtedesc, addr + mem_off, ra); 5845 } 5846 reg_off += esize; 5847 mem_off += msize; 5848 } while (reg_off & 63); 5849 } while (reg_off <= reg_last); 5850 } 5851 } 5852 5853 /* 5854 * Common helper for all contiguous 1,2,3,4-register predicated loads.
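 *
 * For example (a sketch of the macro expansions further below), among the
 * helpers generated by DO_LDN_2(2, hh, MO_16) is the little-endian LD2H one:
 *
 *     void HELPER(sve_ld2hh_le_r)(CPUARMState *env, void *vg,
 *                                 target_ulong addr, uint32_t desc)
 *     {
 *         sve_ldN_r(env, vg, addr, desc, GETPC(), MO_16, MO_16, 2, 0,
 *                   sve_ld1hh_le_host, sve_ld1hh_le_tlb);
 *     }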
5855 */ 5856 static inline QEMU_ALWAYS_INLINE 5857 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, 5858 uint32_t desc, const uintptr_t retaddr, 5859 const int esz, const int msz, const int N, uint32_t mtedesc, 5860 sve_ldst1_host_fn *host_fn, 5861 sve_ldst1_tlb_fn *tlb_fn) 5862 { 5863 const unsigned rd = simd_data(desc); 5864 const intptr_t reg_max = simd_oprsz(desc); 5865 intptr_t reg_off, reg_last, mem_off; 5866 SVEContLdSt info; 5867 void *host; 5868 int flags, i; 5869 5870 /* Find the active elements. */ 5871 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 5872 /* The entire predicate was false; no load occurs. */ 5873 for (i = 0; i < N; ++i) { 5874 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5875 } 5876 return; 5877 } 5878 5879 /* Probe the page(s). Exit with exception for any invalid page. */ 5880 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); 5881 5882 /* Handle watchpoints for all active elements. */ 5883 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 5884 BP_MEM_READ, retaddr); 5885 5886 /* 5887 * Handle mte checks for all active elements. 5888 * Since TBI must be set for MTE, !mtedesc => !mte_active. 5889 */ 5890 if (mtedesc) { 5891 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 5892 mtedesc, retaddr); 5893 } 5894 5895 flags = info.page[0].flags | info.page[1].flags; 5896 if (unlikely(flags != 0)) { 5897 /* 5898 * At least one page includes MMIO. 5899 * Any bus operation can fail with cpu_transaction_failed, 5900 * which for ARM will raise SyncExternal. Perform the load 5901 * into scratch memory to preserve register state until the end. 5902 */ 5903 ARMVectorReg scratch[4] = { }; 5904 5905 mem_off = info.mem_off_first[0]; 5906 reg_off = info.reg_off_first[0]; 5907 reg_last = info.reg_off_last[1]; 5908 if (reg_last < 0) { 5909 reg_last = info.reg_off_split; 5910 if (reg_last < 0) { 5911 reg_last = info.reg_off_last[0]; 5912 } 5913 } 5914 5915 do { 5916 uint64_t pg = vg[reg_off >> 6]; 5917 do { 5918 if ((pg >> (reg_off & 63)) & 1) { 5919 for (i = 0; i < N; ++i) { 5920 tlb_fn(env, &scratch[i], reg_off, 5921 addr + mem_off + (i << msz), retaddr); 5922 } 5923 } 5924 reg_off += 1 << esz; 5925 mem_off += N << msz; 5926 } while (reg_off & 63); 5927 } while (reg_off <= reg_last); 5928 5929 for (i = 0; i < N; ++i) { 5930 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); 5931 } 5932 return; 5933 } 5934 5935 /* The entire operation is in RAM, on valid pages. */ 5936 5937 for (i = 0; i < N; ++i) { 5938 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5939 } 5940 5941 mem_off = info.mem_off_first[0]; 5942 reg_off = info.reg_off_first[0]; 5943 reg_last = info.reg_off_last[0]; 5944 host = info.page[0].host; 5945 5946 set_helper_retaddr(retaddr); 5947 5948 while (reg_off <= reg_last) { 5949 uint64_t pg = vg[reg_off >> 6]; 5950 do { 5951 if ((pg >> (reg_off & 63)) & 1) { 5952 for (i = 0; i < N; ++i) { 5953 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5954 host + mem_off + (i << msz)); 5955 } 5956 } 5957 reg_off += 1 << esz; 5958 mem_off += N << msz; 5959 } while (reg_off <= reg_last && (reg_off & 63)); 5960 } 5961 5962 clear_helper_retaddr(); 5963 5964 /* 5965 * Use the slow path to manage the cross-page misalignment. 5966 * But we know this is RAM and cannot trap. 
5967 */ 5968 mem_off = info.mem_off_split; 5969 if (unlikely(mem_off >= 0)) { 5970 reg_off = info.reg_off_split; 5971 for (i = 0; i < N; ++i) { 5972 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 5973 addr + mem_off + (i << msz), retaddr); 5974 } 5975 } 5976 5977 mem_off = info.mem_off_first[1]; 5978 if (unlikely(mem_off >= 0)) { 5979 reg_off = info.reg_off_first[1]; 5980 reg_last = info.reg_off_last[1]; 5981 host = info.page[1].host; 5982 5983 set_helper_retaddr(retaddr); 5984 5985 do { 5986 uint64_t pg = vg[reg_off >> 6]; 5987 do { 5988 if ((pg >> (reg_off & 63)) & 1) { 5989 for (i = 0; i < N; ++i) { 5990 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5991 host + mem_off + (i << msz)); 5992 } 5993 } 5994 reg_off += 1 << esz; 5995 mem_off += N << msz; 5996 } while (reg_off & 63); 5997 } while (reg_off <= reg_last); 5998 5999 clear_helper_retaddr(); 6000 } 6001 } 6002 6003 static inline QEMU_ALWAYS_INLINE 6004 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6005 uint32_t desc, const uintptr_t ra, 6006 const int esz, const int msz, const int N, 6007 sve_ldst1_host_fn *host_fn, 6008 sve_ldst1_tlb_fn *tlb_fn) 6009 { 6010 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6011 int bit55 = extract64(addr, 55, 1); 6012 6013 /* Remove mtedesc from the normal sve descriptor. */ 6014 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6015 6016 /* Perform gross MTE suppression early. */ 6017 if (!tbi_check(mtedesc, bit55) || 6018 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6019 mtedesc = 0; 6020 } 6021 6022 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6023 } 6024 6025 #define DO_LD1_1(NAME, ESZ) \ 6026 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ 6027 target_ulong addr, uint32_t desc) \ 6028 { \ 6029 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ 6030 sve_##NAME##_host, sve_##NAME##_tlb); \ 6031 } \ 6032 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6033 target_ulong addr, uint32_t desc) \ 6034 { \ 6035 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ 6036 sve_##NAME##_host, sve_##NAME##_tlb); \ 6037 } 6038 6039 #define DO_LD1_2(NAME, ESZ, MSZ) \ 6040 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ 6041 target_ulong addr, uint32_t desc) \ 6042 { \ 6043 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 6044 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 6045 } \ 6046 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ 6047 target_ulong addr, uint32_t desc) \ 6048 { \ 6049 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 6050 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 6051 } \ 6052 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6053 target_ulong addr, uint32_t desc) \ 6054 { \ 6055 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 6056 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 6057 } \ 6058 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6059 target_ulong addr, uint32_t desc) \ 6060 { \ 6061 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 6062 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 6063 } 6064 6065 DO_LD1_1(ld1bb, MO_8) 6066 DO_LD1_1(ld1bhu, MO_16) 6067 DO_LD1_1(ld1bhs, MO_16) 6068 DO_LD1_1(ld1bsu, MO_32) 6069 DO_LD1_1(ld1bss, MO_32) 6070 DO_LD1_1(ld1bdu, MO_64) 6071 DO_LD1_1(ld1bds, MO_64) 6072 6073 DO_LD1_2(ld1hh, MO_16, MO_16) 6074 DO_LD1_2(ld1hsu, MO_32, MO_16) 6075 DO_LD1_2(ld1hss, MO_32, MO_16) 6076 DO_LD1_2(ld1hdu, MO_64, MO_16) 6077 
DO_LD1_2(ld1hds, MO_64, MO_16) 6078 6079 DO_LD1_2(ld1ss, MO_32, MO_32) 6080 DO_LD1_2(ld1sdu, MO_64, MO_32) 6081 DO_LD1_2(ld1sds, MO_64, MO_32) 6082 6083 DO_LD1_2(ld1dd, MO_64, MO_64) 6084 6085 #undef DO_LD1_1 6086 #undef DO_LD1_2 6087 6088 #define DO_LDN_1(N) \ 6089 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ 6090 target_ulong addr, uint32_t desc) \ 6091 { \ 6092 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ 6093 sve_ld1bb_host, sve_ld1bb_tlb); \ 6094 } \ 6095 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ 6096 target_ulong addr, uint32_t desc) \ 6097 { \ 6098 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ 6099 sve_ld1bb_host, sve_ld1bb_tlb); \ 6100 } 6101 6102 #define DO_LDN_2(N, SUFF, ESZ) \ 6103 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ 6104 target_ulong addr, uint32_t desc) \ 6105 { \ 6106 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 6107 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 6108 } \ 6109 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ 6110 target_ulong addr, uint32_t desc) \ 6111 { \ 6112 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 6113 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 6114 } \ 6115 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ 6116 target_ulong addr, uint32_t desc) \ 6117 { \ 6118 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 6119 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 6120 } \ 6121 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ 6122 target_ulong addr, uint32_t desc) \ 6123 { \ 6124 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 6125 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 6126 } 6127 6128 DO_LDN_1(2) 6129 DO_LDN_1(3) 6130 DO_LDN_1(4) 6131 6132 DO_LDN_2(2, hh, MO_16) 6133 DO_LDN_2(3, hh, MO_16) 6134 DO_LDN_2(4, hh, MO_16) 6135 6136 DO_LDN_2(2, ss, MO_32) 6137 DO_LDN_2(3, ss, MO_32) 6138 DO_LDN_2(4, ss, MO_32) 6139 6140 DO_LDN_2(2, dd, MO_64) 6141 DO_LDN_2(3, dd, MO_64) 6142 DO_LDN_2(4, dd, MO_64) 6143 6144 #undef DO_LDN_1 6145 #undef DO_LDN_2 6146 6147 /* 6148 * Load contiguous data, first-fault and no-fault. 6149 * 6150 * For user-only, we control the race between page_check_range and 6151 * another thread's munmap by using set/clear_helper_retaddr. Any 6152 * SEGV that occurs between those markers is assumed to be because 6153 * the guest page vanished. Keep that block as small as possible 6154 * so that unrelated QEMU bugs are not blamed on the guest. 6155 */ 6156 6157 /* Fault on byte I. All bits in FFR from I are cleared. The vector 6158 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE 6159 * option, which leaves subsequent data unchanged. 6160 */ 6161 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) 6162 { 6163 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; 6164 6165 if (i & 63) { 6166 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); 6167 i = ROUND_UP(i, 64); 6168 } 6169 for (; i < oprsz; i += 64) { 6170 ffr[i / 64] = 0; 6171 } 6172 } 6173 6174 /* 6175 * Common helper for all contiguous no-fault and first-fault loads. 
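 *
 * For example (a sketch of the macro expansions below), the first-fault
 * LD1W little-endian helper ends up calling
 *
 *     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, MO_32, MO_32,
 *                   FAULT_FIRST, sve_ld1ss_le_host, sve_ld1ss_le_tlb);
 *
 * while the corresponding no-fault (LDNF1W) helper differs only in
 * passing FAULT_NO.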
6176 */ 6177 static inline QEMU_ALWAYS_INLINE 6178 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, 6179 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc, 6180 const int esz, const int msz, const SVEContFault fault, 6181 sve_ldst1_host_fn *host_fn, 6182 sve_ldst1_tlb_fn *tlb_fn) 6183 { 6184 const unsigned rd = simd_data(desc); 6185 void *vd = &env->vfp.zregs[rd]; 6186 const intptr_t reg_max = simd_oprsz(desc); 6187 intptr_t reg_off, mem_off, reg_last; 6188 SVEContLdSt info; 6189 int flags; 6190 void *host; 6191 6192 /* Find the active elements. */ 6193 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) { 6194 /* The entire predicate was false; no load occurs. */ 6195 memset(vd, 0, reg_max); 6196 return; 6197 } 6198 reg_off = info.reg_off_first[0]; 6199 6200 /* Probe the page(s). */ 6201 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) { 6202 /* Fault on first element. */ 6203 tcg_debug_assert(fault == FAULT_NO); 6204 memset(vd, 0, reg_max); 6205 goto do_fault; 6206 } 6207 6208 mem_off = info.mem_off_first[0]; 6209 flags = info.page[0].flags; 6210 6211 /* 6212 * Disable MTE checking if the Tagged bit is not set. Since TBI must 6213 * be set within MTEDESC for MTE, !mtedesc => !mte_active. 6214 */ 6215 if (!info.page[0].tagged) { 6216 mtedesc = 0; 6217 } 6218 6219 if (fault == FAULT_FIRST) { 6220 /* Trapping mte check for the first-fault element. */ 6221 if (mtedesc) { 6222 mte_check(env, mtedesc, addr + mem_off, retaddr); 6223 } 6224 6225 /* 6226 * Special handling of the first active element, 6227 * if it crosses a page boundary or is MMIO. 6228 */ 6229 bool is_split = mem_off == info.mem_off_split; 6230 if (unlikely(flags != 0) || unlikely(is_split)) { 6231 /* 6232 * Use the slow path for cross-page handling. 6233 * Might trap for MMIO or watchpoints. 6234 */ 6235 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6236 6237 /* After any fault, zero the other elements. */ 6238 swap_memzero(vd, reg_off); 6239 reg_off += 1 << esz; 6240 mem_off += 1 << msz; 6241 swap_memzero(vd + reg_off, reg_max - reg_off); 6242 6243 if (is_split) { 6244 goto second_page; 6245 } 6246 } else { 6247 memset(vd, 0, reg_max); 6248 } 6249 } else { 6250 memset(vd, 0, reg_max); 6251 if (unlikely(mem_off == info.mem_off_split)) { 6252 /* The first active element crosses a page boundary. */ 6253 flags |= info.page[1].flags; 6254 if (unlikely(flags & TLB_MMIO)) { 6255 /* Some page is MMIO, see below. */ 6256 goto do_fault; 6257 } 6258 if (unlikely(flags & TLB_WATCHPOINT) && 6259 (cpu_watchpoint_address_matches 6260 (env_cpu(env), addr + mem_off, 1 << msz) 6261 & BP_MEM_READ)) { 6262 /* Watchpoint hit, see below. */ 6263 goto do_fault; 6264 } 6265 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6266 goto do_fault; 6267 } 6268 /* 6269 * Use the slow path for cross-page handling. 6270 * This is RAM, without a watchpoint, and will not trap. 6271 */ 6272 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6273 goto second_page; 6274 } 6275 } 6276 6277 /* 6278 * From this point on, all memory operations are MemSingleNF. 6279 * 6280 * Per the MemSingleNF pseudocode, a no-fault load from Device memory 6281 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead. 6282 * 6283 * Unfortunately we do not have access to the memory attributes from the 6284 * PTE to tell Device memory from Normal memory. So we make a mostly 6285 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
* This gives the right answer for the common cases of "Normal memory, 6287 * backed by host RAM" and "Device memory, backed by MMIO". 6288 * The architecture allows us to suppress an NF load and return 6289 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner 6290 * case of "Normal memory, backed by MMIO" is permitted. The case we 6291 * get wrong is "Device memory, backed by host RAM", for which we 6292 * should return (UNKNOWN, FAULT) but do not. 6293 * 6294 * Similarly, CPU_BP breakpoints would raise exceptions, and so 6295 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and 6296 * architectural breakpoints the same. 6297 */ 6298 if (unlikely(flags & TLB_MMIO)) { 6299 goto do_fault; 6300 } 6301 6302 reg_last = info.reg_off_last[0]; 6303 host = info.page[0].host; 6304 6305 set_helper_retaddr(retaddr); 6306 6307 do { 6308 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); 6309 do { 6310 if ((pg >> (reg_off & 63)) & 1) { 6311 if (unlikely(flags & TLB_WATCHPOINT) && 6312 (cpu_watchpoint_address_matches 6313 (env_cpu(env), addr + mem_off, 1 << msz) 6314 & BP_MEM_READ)) { 6315 clear_helper_retaddr(); 6316 goto do_fault; 6317 } 6318 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6319 clear_helper_retaddr(); 6320 goto do_fault; 6321 } 6322 host_fn(vd, reg_off, host + mem_off); 6323 } 6324 reg_off += 1 << esz; 6325 mem_off += 1 << msz; 6326 } while (reg_off <= reg_last && (reg_off & 63)); 6327 } while (reg_off <= reg_last); 6328 6329 clear_helper_retaddr(); 6330 6331 /* 6332 * MemSingleNF is allowed to fail for any reason. We have special 6333 * code above to handle the first element crossing a page boundary. 6334 * As an implementation choice, decline to handle a cross-page element 6335 * in any other position. 6336 */ 6337 reg_off = info.reg_off_split; 6338 if (reg_off >= 0) { 6339 goto do_fault; 6340 } 6341 6342 second_page: 6343 reg_off = info.reg_off_first[1]; 6344 if (likely(reg_off < 0)) { 6345 /* No active elements on the second page. All done. */ 6346 return; 6347 } 6348 6349 /* 6350 * MemSingleNF is allowed to fail for any reason. As an implementation 6351 * choice, decline to handle elements on the second page. This should 6352 * be low frequency as the guest walks through memory -- the next 6353 * iteration of the guest's loop should be aligned on the page boundary, 6354 * and then all following iterations will stay aligned. 6355 */ 6356 6357 do_fault: 6358 record_fault(env, reg_off, reg_max); 6359 } 6360 6361 static inline QEMU_ALWAYS_INLINE 6362 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, 6363 uint32_t desc, const uintptr_t retaddr, 6364 const int esz, const int msz, const SVEContFault fault, 6365 sve_ldst1_host_fn *host_fn, 6366 sve_ldst1_tlb_fn *tlb_fn) 6367 { 6368 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6369 int bit55 = extract64(addr, 55, 1); 6370 6371 /* Remove mtedesc from the normal sve descriptor. */ 6372 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6373 6374 /* Perform gross MTE suppression early.
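 * That is (a sketch of the intent): if TBI is disabled for this half of
 * the address space, or the tag is one that TCMA passes unchecked, then
 * no MTE check can ever fire for this access, so drop mtedesc and take
 * the plain (non-MTE) path.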
*/ 6375 if (!tbi_check(mtedesc, bit55) || 6376 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6377 mtedesc = 0; 6378 } 6379 6380 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, 6381 esz, msz, fault, host_fn, tlb_fn); 6382 } 6383 6384 #define DO_LDFF1_LDNF1_1(PART, ESZ) \ 6385 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ 6386 target_ulong addr, uint32_t desc) \ 6387 { \ 6388 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ 6389 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6390 } \ 6391 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ 6392 target_ulong addr, uint32_t desc) \ 6393 { \ 6394 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ 6395 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6396 } \ 6397 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6398 target_ulong addr, uint32_t desc) \ 6399 { \ 6400 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ 6401 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6402 } \ 6403 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6404 target_ulong addr, uint32_t desc) \ 6405 { \ 6406 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ 6407 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6408 } 6409 6410 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ 6411 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ 6412 target_ulong addr, uint32_t desc) \ 6413 { \ 6414 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6415 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6416 } \ 6417 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ 6418 target_ulong addr, uint32_t desc) \ 6419 { \ 6420 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6421 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6422 } \ 6423 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ 6424 target_ulong addr, uint32_t desc) \ 6425 { \ 6426 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6427 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6428 } \ 6429 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ 6430 target_ulong addr, uint32_t desc) \ 6431 { \ 6432 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6433 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6434 } \ 6435 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6436 target_ulong addr, uint32_t desc) \ 6437 { \ 6438 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6439 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6440 } \ 6441 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6442 target_ulong addr, uint32_t desc) \ 6443 { \ 6444 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6445 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6446 } \ 6447 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6448 target_ulong addr, uint32_t desc) \ 6449 { \ 6450 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6451 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6452 } \ 6453 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6454 target_ulong addr, uint32_t desc) \ 6455 { \ 6456 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6457 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6458 } 6459 6460 DO_LDFF1_LDNF1_1(bb, MO_8) 6461 DO_LDFF1_LDNF1_1(bhu, 
MO_16) 6462 DO_LDFF1_LDNF1_1(bhs, MO_16) 6463 DO_LDFF1_LDNF1_1(bsu, MO_32) 6464 DO_LDFF1_LDNF1_1(bss, MO_32) 6465 DO_LDFF1_LDNF1_1(bdu, MO_64) 6466 DO_LDFF1_LDNF1_1(bds, MO_64) 6467 6468 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) 6469 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) 6470 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) 6471 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) 6472 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) 6473 6474 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) 6475 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) 6476 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) 6477 6478 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) 6479 6480 #undef DO_LDFF1_LDNF1_1 6481 #undef DO_LDFF1_LDNF1_2 6482 6483 /* 6484 * Common helper for all contiguous 1,2,3,4-register predicated stores. 6485 */ 6486 6487 static inline QEMU_ALWAYS_INLINE 6488 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, 6489 uint32_t desc, const uintptr_t retaddr, 6490 const int esz, const int msz, const int N, uint32_t mtedesc, 6491 sve_ldst1_host_fn *host_fn, 6492 sve_ldst1_tlb_fn *tlb_fn) 6493 { 6494 const unsigned rd = simd_data(desc); 6495 const intptr_t reg_max = simd_oprsz(desc); 6496 intptr_t reg_off, reg_last, mem_off; 6497 SVEContLdSt info; 6498 void *host; 6499 int i, flags; 6500 6501 /* Find the active elements. */ 6502 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6503 /* The entire predicate was false; no store occurs. */ 6504 return; 6505 } 6506 6507 /* Probe the page(s). Exit with exception for any invalid page. */ 6508 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); 6509 6510 /* Handle watchpoints for all active elements. */ 6511 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6512 BP_MEM_WRITE, retaddr); 6513 6514 /* 6515 * Handle mte checks for all active elements. 6516 * Since TBI must be set for MTE, !mtedesc => !mte_active. 6517 */ 6518 if (mtedesc) { 6519 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6520 mtedesc, retaddr); 6521 } 6522 6523 flags = info.page[0].flags | info.page[1].flags; 6524 if (unlikely(flags != 0)) { 6525 /* 6526 * At least one page includes MMIO. 6527 * Any bus operation can fail with cpu_transaction_failed, 6528 * which for ARM will raise SyncExternal. We cannot avoid 6529 * this fault and will leave with the store incomplete. 
6530 */ 6531 mem_off = info.mem_off_first[0]; 6532 reg_off = info.reg_off_first[0]; 6533 reg_last = info.reg_off_last[1]; 6534 if (reg_last < 0) { 6535 reg_last = info.reg_off_split; 6536 if (reg_last < 0) { 6537 reg_last = info.reg_off_last[0]; 6538 } 6539 } 6540 6541 do { 6542 uint64_t pg = vg[reg_off >> 6]; 6543 do { 6544 if ((pg >> (reg_off & 63)) & 1) { 6545 for (i = 0; i < N; ++i) { 6546 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6547 addr + mem_off + (i << msz), retaddr); 6548 } 6549 } 6550 reg_off += 1 << esz; 6551 mem_off += N << msz; 6552 } while (reg_off & 63); 6553 } while (reg_off <= reg_last); 6554 return; 6555 } 6556 6557 mem_off = info.mem_off_first[0]; 6558 reg_off = info.reg_off_first[0]; 6559 reg_last = info.reg_off_last[0]; 6560 host = info.page[0].host; 6561 6562 set_helper_retaddr(retaddr); 6563 6564 while (reg_off <= reg_last) { 6565 uint64_t pg = vg[reg_off >> 6]; 6566 do { 6567 if ((pg >> (reg_off & 63)) & 1) { 6568 for (i = 0; i < N; ++i) { 6569 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6570 host + mem_off + (i << msz)); 6571 } 6572 } 6573 reg_off += 1 << esz; 6574 mem_off += N << msz; 6575 } while (reg_off <= reg_last && (reg_off & 63)); 6576 } 6577 6578 clear_helper_retaddr(); 6579 6580 /* 6581 * Use the slow path to manage the cross-page misalignment. 6582 * But we know this is RAM and cannot trap. 6583 */ 6584 mem_off = info.mem_off_split; 6585 if (unlikely(mem_off >= 0)) { 6586 reg_off = info.reg_off_split; 6587 for (i = 0; i < N; ++i) { 6588 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6589 addr + mem_off + (i << msz), retaddr); 6590 } 6591 } 6592 6593 mem_off = info.mem_off_first[1]; 6594 if (unlikely(mem_off >= 0)) { 6595 reg_off = info.reg_off_first[1]; 6596 reg_last = info.reg_off_last[1]; 6597 host = info.page[1].host; 6598 6599 set_helper_retaddr(retaddr); 6600 6601 do { 6602 uint64_t pg = vg[reg_off >> 6]; 6603 do { 6604 if ((pg >> (reg_off & 63)) & 1) { 6605 for (i = 0; i < N; ++i) { 6606 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6607 host + mem_off + (i << msz)); 6608 } 6609 } 6610 reg_off += 1 << esz; 6611 mem_off += N << msz; 6612 } while (reg_off & 63); 6613 } while (reg_off <= reg_last); 6614 6615 clear_helper_retaddr(); 6616 } 6617 } 6618 6619 static inline QEMU_ALWAYS_INLINE 6620 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6621 uint32_t desc, const uintptr_t ra, 6622 const int esz, const int msz, const int N, 6623 sve_ldst1_host_fn *host_fn, 6624 sve_ldst1_tlb_fn *tlb_fn) 6625 { 6626 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6627 int bit55 = extract64(addr, 55, 1); 6628 6629 /* Remove mtedesc from the normal sve descriptor. */ 6630 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6631 6632 /* Perform gross MTE suppression early. 
*/ 6633 if (!tbi_check(mtedesc, bit55) || 6634 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6635 mtedesc = 0; 6636 } 6637 6638 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6639 } 6640 6641 #define DO_STN_1(N, NAME, ESZ) \ 6642 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ 6643 target_ulong addr, uint32_t desc) \ 6644 { \ 6645 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ 6646 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6647 } \ 6648 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6649 target_ulong addr, uint32_t desc) \ 6650 { \ 6651 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ 6652 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6653 } 6654 6655 #define DO_STN_2(N, NAME, ESZ, MSZ) \ 6656 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ 6657 target_ulong addr, uint32_t desc) \ 6658 { \ 6659 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6660 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6661 } \ 6662 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ 6663 target_ulong addr, uint32_t desc) \ 6664 { \ 6665 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6666 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6667 } \ 6668 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6669 target_ulong addr, uint32_t desc) \ 6670 { \ 6671 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6672 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6673 } \ 6674 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6675 target_ulong addr, uint32_t desc) \ 6676 { \ 6677 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6678 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6679 } 6680 6681 DO_STN_1(1, bb, MO_8) 6682 DO_STN_1(1, bh, MO_16) 6683 DO_STN_1(1, bs, MO_32) 6684 DO_STN_1(1, bd, MO_64) 6685 DO_STN_1(2, bb, MO_8) 6686 DO_STN_1(3, bb, MO_8) 6687 DO_STN_1(4, bb, MO_8) 6688 6689 DO_STN_2(1, hh, MO_16, MO_16) 6690 DO_STN_2(1, hs, MO_32, MO_16) 6691 DO_STN_2(1, hd, MO_64, MO_16) 6692 DO_STN_2(2, hh, MO_16, MO_16) 6693 DO_STN_2(3, hh, MO_16, MO_16) 6694 DO_STN_2(4, hh, MO_16, MO_16) 6695 6696 DO_STN_2(1, ss, MO_32, MO_32) 6697 DO_STN_2(1, sd, MO_64, MO_32) 6698 DO_STN_2(2, ss, MO_32, MO_32) 6699 DO_STN_2(3, ss, MO_32, MO_32) 6700 DO_STN_2(4, ss, MO_32, MO_32) 6701 6702 DO_STN_2(1, dd, MO_64, MO_64) 6703 DO_STN_2(2, dd, MO_64, MO_64) 6704 DO_STN_2(3, dd, MO_64, MO_64) 6705 DO_STN_2(4, dd, MO_64, MO_64) 6706 6707 #undef DO_STN_1 6708 #undef DO_STN_2 6709 6710 /* 6711 * Loads with a vector index. 6712 */ 6713 6714 /* 6715 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. 
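 *
 * For example (sketch): with a 64-bit vector element containing
 * 0x00000000ffffffffull, off_zsu_d() below returns 0xffffffff, while
 * off_zss_d() sign-extends the low 32 bits and returns (target_ulong)-1.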
6716 */ 6717 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); 6718 6719 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) 6720 { 6721 return *(uint32_t *)(reg + H1_4(reg_ofs)); 6722 } 6723 6724 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) 6725 { 6726 return *(int32_t *)(reg + H1_4(reg_ofs)); 6727 } 6728 6729 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) 6730 { 6731 return (uint32_t)*(uint64_t *)(reg + reg_ofs); 6732 } 6733 6734 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) 6735 { 6736 return (int32_t)*(uint64_t *)(reg + reg_ofs); 6737 } 6738 6739 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) 6740 { 6741 return *(uint64_t *)(reg + reg_ofs); 6742 } 6743 6744 static inline QEMU_ALWAYS_INLINE 6745 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6746 target_ulong base, uint32_t desc, uintptr_t retaddr, 6747 uint32_t mtedesc, int esize, int msize, 6748 zreg_off_fn *off_fn, 6749 sve_ldst1_host_fn *host_fn, 6750 sve_ldst1_tlb_fn *tlb_fn) 6751 { 6752 const int mmu_idx = arm_env_mmu_index(env); 6753 const intptr_t reg_max = simd_oprsz(desc); 6754 const int scale = simd_data(desc); 6755 ARMVectorReg scratch; 6756 intptr_t reg_off; 6757 SVEHostPage info, info2; 6758 6759 memset(&scratch, 0, reg_max); 6760 reg_off = 0; 6761 do { 6762 uint64_t pg = vg[reg_off >> 6]; 6763 do { 6764 if (likely(pg & 1)) { 6765 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6766 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6767 6768 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, 6769 mmu_idx, retaddr); 6770 6771 if (likely(in_page >= msize)) { 6772 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6773 cpu_check_watchpoint(env_cpu(env), addr, msize, 6774 info.attrs, BP_MEM_READ, retaddr); 6775 } 6776 if (mtedesc && info.tagged) { 6777 mte_check(env, mtedesc, addr, retaddr); 6778 } 6779 if (unlikely(info.flags & TLB_MMIO)) { 6780 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6781 } else { 6782 set_helper_retaddr(retaddr); 6783 host_fn(&scratch, reg_off, info.host); 6784 clear_helper_retaddr(); 6785 } 6786 } else { 6787 /* Element crosses the page boundary. */ 6788 sve_probe_page(&info2, false, env, addr + in_page, 0, 6789 MMU_DATA_LOAD, mmu_idx, retaddr); 6790 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { 6791 cpu_check_watchpoint(env_cpu(env), addr, 6792 msize, info.attrs, 6793 BP_MEM_READ, retaddr); 6794 } 6795 if (mtedesc && info.tagged) { 6796 mte_check(env, mtedesc, addr, retaddr); 6797 } 6798 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6799 } 6800 } 6801 reg_off += esize; 6802 pg >>= esize; 6803 } while (reg_off & 63); 6804 } while (reg_off < reg_max); 6805 6806 /* Wait until all exceptions have been raised to write back. */ 6807 memcpy(vd, &scratch, reg_max); 6808 } 6809 6810 static inline QEMU_ALWAYS_INLINE 6811 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6812 target_ulong base, uint32_t desc, uintptr_t retaddr, 6813 int esize, int msize, zreg_off_fn *off_fn, 6814 sve_ldst1_host_fn *host_fn, 6815 sve_ldst1_tlb_fn *tlb_fn) 6816 { 6817 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6818 /* Remove mtedesc from the normal sve descriptor. */ 6819 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6820 6821 /* 6822 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6823 * offset base entirely over the address space hole to change the 6824 * pointer tag, or change the bit55 selector. 
So we could here 6825 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 6826 */ 6827 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6828 esize, msize, off_fn, host_fn, tlb_fn); 6829 } 6830 6831 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ 6832 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6833 void *vm, target_ulong base, uint32_t desc) \ 6834 { \ 6835 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 6836 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6837 } \ 6838 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6839 void *vm, target_ulong base, uint32_t desc) \ 6840 { \ 6841 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 6842 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6843 } 6844 6845 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ 6846 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6847 void *vm, target_ulong base, uint32_t desc) \ 6848 { \ 6849 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 6850 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6851 } \ 6852 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6853 void *vm, target_ulong base, uint32_t desc) \ 6854 { \ 6855 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 6856 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6857 } 6858 6859 DO_LD1_ZPZ_S(bsu, zsu, MO_8) 6860 DO_LD1_ZPZ_S(bsu, zss, MO_8) 6861 DO_LD1_ZPZ_D(bdu, zsu, MO_8) 6862 DO_LD1_ZPZ_D(bdu, zss, MO_8) 6863 DO_LD1_ZPZ_D(bdu, zd, MO_8) 6864 6865 DO_LD1_ZPZ_S(bss, zsu, MO_8) 6866 DO_LD1_ZPZ_S(bss, zss, MO_8) 6867 DO_LD1_ZPZ_D(bds, zsu, MO_8) 6868 DO_LD1_ZPZ_D(bds, zss, MO_8) 6869 DO_LD1_ZPZ_D(bds, zd, MO_8) 6870 6871 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) 6872 DO_LD1_ZPZ_S(hsu_le, zss, MO_16) 6873 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) 6874 DO_LD1_ZPZ_D(hdu_le, zss, MO_16) 6875 DO_LD1_ZPZ_D(hdu_le, zd, MO_16) 6876 6877 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) 6878 DO_LD1_ZPZ_S(hsu_be, zss, MO_16) 6879 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) 6880 DO_LD1_ZPZ_D(hdu_be, zss, MO_16) 6881 DO_LD1_ZPZ_D(hdu_be, zd, MO_16) 6882 6883 DO_LD1_ZPZ_S(hss_le, zsu, MO_16) 6884 DO_LD1_ZPZ_S(hss_le, zss, MO_16) 6885 DO_LD1_ZPZ_D(hds_le, zsu, MO_16) 6886 DO_LD1_ZPZ_D(hds_le, zss, MO_16) 6887 DO_LD1_ZPZ_D(hds_le, zd, MO_16) 6888 6889 DO_LD1_ZPZ_S(hss_be, zsu, MO_16) 6890 DO_LD1_ZPZ_S(hss_be, zss, MO_16) 6891 DO_LD1_ZPZ_D(hds_be, zsu, MO_16) 6892 DO_LD1_ZPZ_D(hds_be, zss, MO_16) 6893 DO_LD1_ZPZ_D(hds_be, zd, MO_16) 6894 6895 DO_LD1_ZPZ_S(ss_le, zsu, MO_32) 6896 DO_LD1_ZPZ_S(ss_le, zss, MO_32) 6897 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) 6898 DO_LD1_ZPZ_D(sdu_le, zss, MO_32) 6899 DO_LD1_ZPZ_D(sdu_le, zd, MO_32) 6900 6901 DO_LD1_ZPZ_S(ss_be, zsu, MO_32) 6902 DO_LD1_ZPZ_S(ss_be, zss, MO_32) 6903 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) 6904 DO_LD1_ZPZ_D(sdu_be, zss, MO_32) 6905 DO_LD1_ZPZ_D(sdu_be, zd, MO_32) 6906 6907 DO_LD1_ZPZ_D(sds_le, zsu, MO_32) 6908 DO_LD1_ZPZ_D(sds_le, zss, MO_32) 6909 DO_LD1_ZPZ_D(sds_le, zd, MO_32) 6910 6911 DO_LD1_ZPZ_D(sds_be, zsu, MO_32) 6912 DO_LD1_ZPZ_D(sds_be, zss, MO_32) 6913 DO_LD1_ZPZ_D(sds_be, zd, MO_32) 6914 6915 DO_LD1_ZPZ_D(dd_le, zsu, MO_64) 6916 DO_LD1_ZPZ_D(dd_le, zss, MO_64) 6917 DO_LD1_ZPZ_D(dd_le, zd, MO_64) 6918 6919 DO_LD1_ZPZ_D(dd_be, zsu, MO_64) 6920 DO_LD1_ZPZ_D(dd_be, zss, MO_64) 6921 DO_LD1_ZPZ_D(dd_be, zd, MO_64) 6922 6923 #undef DO_LD1_ZPZ_S 6924 #undef DO_LD1_ZPZ_D 6925 6926 /* First fault loads with a vector index. 
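 *
 * As with the contiguous case (a summary of the code that follows): only
 * the first active element is accessed with a faulting probe; every later
 * element is probed with nofault set, and an element that cannot be
 * accessed (invalid page, MMIO, watchpoint hit, failed MTE probe, or a
 * page-crossing element) stops the load and records its position in the
 * FFR via record_fault().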
*/ 6927 6928 /* 6929 * Common helpers for all gather first-faulting loads. 6930 */ 6931 6932 static inline QEMU_ALWAYS_INLINE 6933 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6934 target_ulong base, uint32_t desc, uintptr_t retaddr, 6935 uint32_t mtedesc, const int esz, const int msz, 6936 zreg_off_fn *off_fn, 6937 sve_ldst1_host_fn *host_fn, 6938 sve_ldst1_tlb_fn *tlb_fn) 6939 { 6940 const int mmu_idx = arm_env_mmu_index(env); 6941 const intptr_t reg_max = simd_oprsz(desc); 6942 const int scale = simd_data(desc); 6943 const int esize = 1 << esz; 6944 const int msize = 1 << msz; 6945 intptr_t reg_off; 6946 SVEHostPage info; 6947 target_ulong addr, in_page; 6948 ARMVectorReg scratch; 6949 6950 /* Skip to the first true predicate. */ 6951 reg_off = find_next_active(vg, 0, reg_max, esz); 6952 if (unlikely(reg_off >= reg_max)) { 6953 /* The entire predicate was false; no load occurs. */ 6954 memset(vd, 0, reg_max); 6955 return; 6956 } 6957 6958 /* Protect against overlap between vd and vm. */ 6959 if (unlikely(vd == vm)) { 6960 vm = memcpy(&scratch, vm, reg_max); 6961 } 6962 6963 /* 6964 * Probe the first element, allowing faults. 6965 */ 6966 addr = base + (off_fn(vm, reg_off) << scale); 6967 if (mtedesc) { 6968 mte_check(env, mtedesc, addr, retaddr); 6969 } 6970 tlb_fn(env, vd, reg_off, addr, retaddr); 6971 6972 /* After any fault, zero the other elements. */ 6973 swap_memzero(vd, reg_off); 6974 reg_off += esize; 6975 swap_memzero(vd + reg_off, reg_max - reg_off); 6976 6977 /* 6978 * Probe the remaining elements, not allowing faults. 6979 */ 6980 while (reg_off < reg_max) { 6981 uint64_t pg = vg[reg_off >> 6]; 6982 do { 6983 if (likely((pg >> (reg_off & 63)) & 1)) { 6984 addr = base + (off_fn(vm, reg_off) << scale); 6985 in_page = -(addr | TARGET_PAGE_MASK); 6986 6987 if (unlikely(in_page < msize)) { 6988 /* Stop if the element crosses a page boundary. */ 6989 goto fault; 6990 } 6991 6992 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, 6993 mmu_idx, retaddr); 6994 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { 6995 goto fault; 6996 } 6997 if (unlikely(info.flags & TLB_WATCHPOINT) && 6998 (cpu_watchpoint_address_matches 6999 (env_cpu(env), addr, msize) & BP_MEM_READ)) { 7000 goto fault; 7001 } 7002 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { 7003 goto fault; 7004 } 7005 7006 set_helper_retaddr(retaddr); 7007 host_fn(vd, reg_off, info.host); 7008 clear_helper_retaddr(); 7009 } 7010 reg_off += esize; 7011 } while (reg_off & 63); 7012 } 7013 return; 7014 7015 fault: 7016 record_fault(env, reg_off, reg_max); 7017 } 7018 7019 static inline QEMU_ALWAYS_INLINE 7020 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7021 target_ulong base, uint32_t desc, uintptr_t retaddr, 7022 const int esz, const int msz, 7023 zreg_off_fn *off_fn, 7024 sve_ldst1_host_fn *host_fn, 7025 sve_ldst1_tlb_fn *tlb_fn) 7026 { 7027 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7028 /* Remove mtedesc from the normal sve descriptor. */ 7029 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7030 7031 /* 7032 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7033 * offset base entirely over the address space hole to change the 7034 * pointer tag, or change the bit55 selector. So we could here 7035 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
7036 */ 7037 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7038 esz, msz, off_fn, host_fn, tlb_fn); 7039 } 7040 7041 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ 7042 void HELPER(sve_ldff##MEM##_##OFS) \ 7043 (CPUARMState *env, void *vd, void *vg, \ 7044 void *vm, target_ulong base, uint32_t desc) \ 7045 { \ 7046 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ 7047 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7048 } \ 7049 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 7050 (CPUARMState *env, void *vd, void *vg, \ 7051 void *vm, target_ulong base, uint32_t desc) \ 7052 { \ 7053 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ 7054 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7055 } 7056 7057 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ 7058 void HELPER(sve_ldff##MEM##_##OFS) \ 7059 (CPUARMState *env, void *vd, void *vg, \ 7060 void *vm, target_ulong base, uint32_t desc) \ 7061 { \ 7062 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ 7063 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7064 } \ 7065 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 7066 (CPUARMState *env, void *vd, void *vg, \ 7067 void *vm, target_ulong base, uint32_t desc) \ 7068 { \ 7069 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ 7070 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7071 } 7072 7073 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) 7074 DO_LDFF1_ZPZ_S(bsu, zss, MO_8) 7075 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) 7076 DO_LDFF1_ZPZ_D(bdu, zss, MO_8) 7077 DO_LDFF1_ZPZ_D(bdu, zd, MO_8) 7078 7079 DO_LDFF1_ZPZ_S(bss, zsu, MO_8) 7080 DO_LDFF1_ZPZ_S(bss, zss, MO_8) 7081 DO_LDFF1_ZPZ_D(bds, zsu, MO_8) 7082 DO_LDFF1_ZPZ_D(bds, zss, MO_8) 7083 DO_LDFF1_ZPZ_D(bds, zd, MO_8) 7084 7085 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) 7086 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) 7087 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) 7088 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) 7089 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) 7090 7091 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) 7092 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) 7093 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) 7094 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) 7095 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) 7096 7097 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16) 7098 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) 7099 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) 7100 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) 7101 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) 7102 7103 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) 7104 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) 7105 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) 7106 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) 7107 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) 7108 7109 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) 7110 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) 7111 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) 7112 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) 7113 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) 7114 7115 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) 7116 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) 7117 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) 7118 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) 7119 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) 7120 7121 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) 7122 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) 7123 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) 7124 7125 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) 7126 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) 7127 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) 7128 7129 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) 7130 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) 7131 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) 7132 7133 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) 7134 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) 7135 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) 7136 7137 /* Stores with a vector index. 
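 *
 * Scatter stores below (sve_st1_z) are done in two passes over the
 * active elements.  The first pass only probes: it resolves a host
 * address for each element (probing both pages when an element crosses
 * a page boundary), takes any recoverable fault, checks watchpoints and
 * performs the MTE checks.  Only after every element has been probed
 * does the second pass write the data, through the cached host pointer
 * for the common in-RAM case and through tlb_fn for MMIO or
 * page-crossing elements.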
*/ 7138 7139 static inline QEMU_ALWAYS_INLINE 7140 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7141 target_ulong base, uint32_t desc, uintptr_t retaddr, 7142 uint32_t mtedesc, int esize, int msize, 7143 zreg_off_fn *off_fn, 7144 sve_ldst1_host_fn *host_fn, 7145 sve_ldst1_tlb_fn *tlb_fn) 7146 { 7147 const int mmu_idx = arm_env_mmu_index(env); 7148 const intptr_t reg_max = simd_oprsz(desc); 7149 const int scale = simd_data(desc); 7150 void *host[ARM_MAX_VQ * 4]; 7151 intptr_t reg_off, i; 7152 SVEHostPage info, info2; 7153 7154 /* 7155 * Probe all of the elements for host addresses and flags. 7156 */ 7157 i = reg_off = 0; 7158 do { 7159 uint64_t pg = vg[reg_off >> 6]; 7160 do { 7161 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7162 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 7163 7164 host[i] = NULL; 7165 if (likely((pg >> (reg_off & 63)) & 1)) { 7166 if (likely(in_page >= msize)) { 7167 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE, 7168 mmu_idx, retaddr); 7169 if (!(info.flags & TLB_MMIO)) { 7170 host[i] = info.host; 7171 } 7172 } else { 7173 /* 7174 * Element crosses the page boundary. 7175 * Probe both pages, but do not record the host address, 7176 * so that we use the slow path. 7177 */ 7178 sve_probe_page(&info, false, env, addr, 0, 7179 MMU_DATA_STORE, mmu_idx, retaddr); 7180 sve_probe_page(&info2, false, env, addr + in_page, 0, 7181 MMU_DATA_STORE, mmu_idx, retaddr); 7182 info.flags |= info2.flags; 7183 } 7184 7185 if (unlikely(info.flags & TLB_WATCHPOINT)) { 7186 cpu_check_watchpoint(env_cpu(env), addr, msize, 7187 info.attrs, BP_MEM_WRITE, retaddr); 7188 } 7189 7190 if (mtedesc && info.tagged) { 7191 mte_check(env, mtedesc, addr, retaddr); 7192 } 7193 } 7194 i += 1; 7195 reg_off += esize; 7196 } while (reg_off & 63); 7197 } while (reg_off < reg_max); 7198 7199 /* 7200 * Now that we have recognized all exceptions except SyncExternal 7201 * (from TLB_MMIO), which we cannot avoid, perform all of the stores. 7202 * 7203 * Note for the common case of an element in RAM, not crossing a page 7204 * boundary, we have stored the host address in host[]. This doubles 7205 * as a first-level check against the predicate, since only enabled 7206 * elements have non-null host addresses. 7207 */ 7208 i = reg_off = 0; 7209 do { 7210 void *h = host[i]; 7211 if (likely(h != NULL)) { 7212 set_helper_retaddr(retaddr); 7213 host_fn(vd, reg_off, h); 7214 clear_helper_retaddr(); 7215 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) { 7216 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7217 tlb_fn(env, vd, reg_off, addr, retaddr); 7218 } 7219 i += 1; 7220 reg_off += esize; 7221 } while (reg_off < reg_max); 7222 } 7223 7224 static inline QEMU_ALWAYS_INLINE 7225 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7226 target_ulong base, uint32_t desc, uintptr_t retaddr, 7227 int esize, int msize, zreg_off_fn *off_fn, 7228 sve_ldst1_host_fn *host_fn, 7229 sve_ldst1_tlb_fn *tlb_fn) 7230 { 7231 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7232 /* Remove mtedesc from the normal sve descriptor. */ 7233 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7234 7235 /* 7236 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7237 * offset base entirely over the address space hole to change the 7238 * pointer tag, or change the bit55 selector. So we could here 7239 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
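 * (The suppression sketch given above in sve_ldff1_z_mte() would apply
 * here unchanged.)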
7240 */ 7241 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7242 esize, msize, off_fn, host_fn, tlb_fn); 7243 } 7244 7245 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ 7246 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7247 void *vm, target_ulong base, uint32_t desc) \ 7248 { \ 7249 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 7250 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7251 } \ 7252 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7253 void *vm, target_ulong base, uint32_t desc) \ 7254 { \ 7255 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 7256 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7257 } 7258 7259 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ 7260 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7261 void *vm, target_ulong base, uint32_t desc) \ 7262 { \ 7263 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 7264 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7265 } \ 7266 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7267 void *vm, target_ulong base, uint32_t desc) \ 7268 { \ 7269 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 7270 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7271 } 7272 7273 DO_ST1_ZPZ_S(bs, zsu, MO_8) 7274 DO_ST1_ZPZ_S(hs_le, zsu, MO_16) 7275 DO_ST1_ZPZ_S(hs_be, zsu, MO_16) 7276 DO_ST1_ZPZ_S(ss_le, zsu, MO_32) 7277 DO_ST1_ZPZ_S(ss_be, zsu, MO_32) 7278 7279 DO_ST1_ZPZ_S(bs, zss, MO_8) 7280 DO_ST1_ZPZ_S(hs_le, zss, MO_16) 7281 DO_ST1_ZPZ_S(hs_be, zss, MO_16) 7282 DO_ST1_ZPZ_S(ss_le, zss, MO_32) 7283 DO_ST1_ZPZ_S(ss_be, zss, MO_32) 7284 7285 DO_ST1_ZPZ_D(bd, zsu, MO_8) 7286 DO_ST1_ZPZ_D(hd_le, zsu, MO_16) 7287 DO_ST1_ZPZ_D(hd_be, zsu, MO_16) 7288 DO_ST1_ZPZ_D(sd_le, zsu, MO_32) 7289 DO_ST1_ZPZ_D(sd_be, zsu, MO_32) 7290 DO_ST1_ZPZ_D(dd_le, zsu, MO_64) 7291 DO_ST1_ZPZ_D(dd_be, zsu, MO_64) 7292 7293 DO_ST1_ZPZ_D(bd, zss, MO_8) 7294 DO_ST1_ZPZ_D(hd_le, zss, MO_16) 7295 DO_ST1_ZPZ_D(hd_be, zss, MO_16) 7296 DO_ST1_ZPZ_D(sd_le, zss, MO_32) 7297 DO_ST1_ZPZ_D(sd_be, zss, MO_32) 7298 DO_ST1_ZPZ_D(dd_le, zss, MO_64) 7299 DO_ST1_ZPZ_D(dd_be, zss, MO_64) 7300 7301 DO_ST1_ZPZ_D(bd, zd, MO_8) 7302 DO_ST1_ZPZ_D(hd_le, zd, MO_16) 7303 DO_ST1_ZPZ_D(hd_be, zd, MO_16) 7304 DO_ST1_ZPZ_D(sd_le, zd, MO_32) 7305 DO_ST1_ZPZ_D(sd_be, zd, MO_32) 7306 DO_ST1_ZPZ_D(dd_le, zd, MO_64) 7307 DO_ST1_ZPZ_D(dd_be, zd, MO_64) 7308 7309 #undef DO_ST1_ZPZ_S 7310 #undef DO_ST1_ZPZ_D 7311 7312 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7313 { 7314 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7315 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7316 7317 for (i = 0; i < opr_sz; ++i) { 7318 d[i] = n[i] ^ m[i] ^ k[i]; 7319 } 7320 } 7321 7322 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7323 { 7324 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7325 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7326 7327 for (i = 0; i < opr_sz; ++i) { 7328 d[i] = n[i] ^ (m[i] & ~k[i]); 7329 } 7330 } 7331 7332 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7333 { 7334 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7335 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7336 7337 for (i = 0; i < opr_sz; ++i) { 7338 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]); 7339 } 7340 } 7341 7342 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7343 { 7344 intptr_t i, opr_sz = simd_oprsz(desc) / 
8; 7345 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7346 7347 for (i = 0; i < opr_sz; ++i) { 7348 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]); 7349 } 7350 } 7351 7352 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7353 { 7354 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7355 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7356 7357 for (i = 0; i < opr_sz; ++i) { 7358 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i])); 7359 } 7360 } 7361 7362 /* 7363 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n. 7364 * See hasless(v,1) from 7365 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord 7366 */ 7367 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz) 7368 { 7369 int bits = 8 << esz; 7370 uint64_t ones = dup_const(esz, 1); 7371 uint64_t signs = ones << (bits - 1); 7372 uint64_t cmp0, cmp1; 7373 7374 cmp1 = dup_const(esz, n); 7375 cmp0 = cmp1 ^ m0; 7376 cmp1 = cmp1 ^ m1; 7377 cmp0 = (cmp0 - ones) & ~cmp0; 7378 cmp1 = (cmp1 - ones) & ~cmp1; 7379 return (cmp0 | cmp1) & signs; 7380 } 7381 7382 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg, 7383 uint32_t desc, int esz, bool nmatch) 7384 { 7385 uint16_t esz_mask = pred_esz_masks[esz]; 7386 intptr_t opr_sz = simd_oprsz(desc); 7387 uint32_t flags = PREDTEST_INIT; 7388 intptr_t i, j, k; 7389 7390 for (i = 0; i < opr_sz; i += 16) { 7391 uint64_t m0 = *(uint64_t *)(vm + i); 7392 uint64_t m1 = *(uint64_t *)(vm + i + 8); 7393 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask; 7394 uint16_t out = 0; 7395 7396 for (j = 0; j < 16; j += 8) { 7397 uint64_t n = *(uint64_t *)(vn + i + j); 7398 7399 for (k = 0; k < 8; k += 1 << esz) { 7400 if (pg & (1 << (j + k))) { 7401 bool o = do_match2(n >> (k * 8), m0, m1, esz); 7402 out |= (o ^ nmatch) << (j + k); 7403 } 7404 } 7405 } 7406 *(uint16_t *)(vd + H1_2(i >> 3)) = out; 7407 flags = iter_predtest_fwd(out, pg, flags); 7408 } 7409 return flags; 7410 } 7411 7412 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \ 7413 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 7414 { \ 7415 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \ 7416 } 7417 7418 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false) 7419 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false) 7420 7421 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true) 7422 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true) 7423 7424 #undef DO_PPZZ_MATCH 7425 7426 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg, 7427 uint32_t desc) 7428 { 7429 ARMVectorReg scratch; 7430 intptr_t i, j; 7431 intptr_t opr_sz = simd_oprsz(desc); 7432 uint32_t *d = vd, *n = vn, *m = vm; 7433 uint8_t *pg = vg; 7434 7435 if (d == n) { 7436 n = memcpy(&scratch, n, opr_sz); 7437 if (d == m) { 7438 m = n; 7439 } 7440 } else if (d == m) { 7441 m = memcpy(&scratch, m, opr_sz); 7442 } 7443 7444 for (i = 0; i < opr_sz; i += 4) { 7445 uint64_t count = 0; 7446 uint8_t pred; 7447 7448 pred = pg[H1(i >> 3)] >> (i & 7); 7449 if (pred & 1) { 7450 uint32_t nn = n[H4(i >> 2)]; 7451 7452 for (j = 0; j <= i; j += 4) { 7453 pred = pg[H1(j >> 3)] >> (j & 7); 7454 if ((pred & 1) && nn == m[H4(j >> 2)]) { 7455 ++count; 7456 } 7457 } 7458 } 7459 d[H4(i >> 2)] = count; 7460 } 7461 } 7462 7463 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg, 7464 uint32_t desc) 7465 { 7466 ARMVectorReg scratch; 7467 intptr_t i, j; 7468 intptr_t opr_sz = simd_oprsz(desc); 7469 uint64_t *d = vd, *n = vn, *m = vm; 7470 uint8_t *pg = vg; 7471 7472 if (d == n) { 7473 n = memcpy(&scratch, n, opr_sz); 7474 if (d 
== m) { 7475 m = n; 7476 } 7477 } else if (d == m) { 7478 m = memcpy(&scratch, m, opr_sz); 7479 } 7480 7481 for (i = 0; i < opr_sz / 8; ++i) { 7482 uint64_t count = 0; 7483 if (pg[H1(i)] & 1) { 7484 uint64_t nn = n[i]; 7485 for (j = 0; j <= i; ++j) { 7486 if ((pg[H1(j)] & 1) && nn == m[j]) { 7487 ++count; 7488 } 7489 } 7490 } 7491 d[i] = count; 7492 } 7493 } 7494 7495 /* 7496 * Returns the number of bytes in m0 and m1 that match n. 7497 * Unlike do_match2 we don't just need true/false, we need an exact count. 7498 * This requires two extra logical operations. 7499 */ 7500 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1) 7501 { 7502 const uint64_t mask = dup_const(MO_8, 0x7f); 7503 uint64_t cmp0, cmp1; 7504 7505 cmp1 = dup_const(MO_8, n); 7506 cmp0 = cmp1 ^ m0; 7507 cmp1 = cmp1 ^ m1; 7508 7509 /* 7510 * 1: clear msb of each byte to avoid carry to next byte (& mask) 7511 * 2: carry in to msb if byte != 0 (+ mask) 7512 * 3: set msb if cmp has msb set (| cmp) 7513 * 4: set ~msb to ignore them (| mask) 7514 * We now have 0xff for byte != 0 or 0x7f for byte == 0. 7515 * 5: invert, resulting in 0x80 if and only if byte == 0. 7516 */ 7517 cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask); 7518 cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask); 7519 7520 /* 7521 * Combine the two compares in a way that the bits do 7522 * not overlap, and so preserves the count of set bits. 7523 * If the host has an efficient instruction for ctpop, 7524 * then ctpop(x) + ctpop(y) has the same number of 7525 * operations as ctpop(x | (y >> 1)). If the host does 7526 * not have an efficient ctpop, then we only want to 7527 * use it once. 7528 */ 7529 return ctpop64(cmp0 | (cmp1 >> 1)); 7530 } 7531 7532 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc) 7533 { 7534 intptr_t i, j; 7535 intptr_t opr_sz = simd_oprsz(desc); 7536 7537 for (i = 0; i < opr_sz; i += 16) { 7538 uint64_t n0 = *(uint64_t *)(vn + i); 7539 uint64_t m0 = *(uint64_t *)(vm + i); 7540 uint64_t n1 = *(uint64_t *)(vn + i + 8); 7541 uint64_t m1 = *(uint64_t *)(vm + i + 8); 7542 uint64_t out0 = 0; 7543 uint64_t out1 = 0; 7544 7545 for (j = 0; j < 64; j += 8) { 7546 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1); 7547 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1); 7548 out0 |= cnt0 << j; 7549 out1 |= cnt1 << j; 7550 } 7551 7552 *(uint64_t *)(vd + i) = out0; 7553 *(uint64_t *)(vd + i + 8) = out1; 7554 } 7555 } 7556 7557 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc) 7558 { 7559 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7560 int shr = simd_data(desc); 7561 int shl = 8 - shr; 7562 uint64_t mask = dup_const(MO_8, 0xff >> shr); 7563 uint64_t *d = vd, *n = vn, *m = vm; 7564 7565 for (i = 0; i < opr_sz; ++i) { 7566 uint64_t t = n[i] ^ m[i]; 7567 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); 7568 } 7569 } 7570 7571 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc) 7572 { 7573 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7574 int shr = simd_data(desc); 7575 int shl = 16 - shr; 7576 uint64_t mask = dup_const(MO_16, 0xffff >> shr); 7577 uint64_t *d = vd, *n = vn, *m = vm; 7578 7579 for (i = 0; i < opr_sz; ++i) { 7580 uint64_t t = n[i] ^ m[i]; 7581 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); 7582 } 7583 } 7584 7585 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc) 7586 { 7587 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 7588 int shr = simd_data(desc); 7589 uint32_t *d = vd, *n = vn, *m = vm; 7590 7591 for (i = 0; i < opr_sz; ++i) { 7592 d[i] = ror32(n[i] ^ 
m[i], shr); 7593 } 7594 } 7595 7596 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va, 7597 float_status *status, uint32_t desc) 7598 { 7599 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4); 7600 7601 for (s = 0; s < opr_sz; ++s) { 7602 float32 *n = vn + s * sizeof(float32) * 4; 7603 float32 *m = vm + s * sizeof(float32) * 4; 7604 float32 *a = va + s * sizeof(float32) * 4; 7605 float32 *d = vd + s * sizeof(float32) * 4; 7606 float32 n00 = n[H4(0)], n01 = n[H4(1)]; 7607 float32 n10 = n[H4(2)], n11 = n[H4(3)]; 7608 float32 m00 = m[H4(0)], m01 = m[H4(1)]; 7609 float32 m10 = m[H4(2)], m11 = m[H4(3)]; 7610 float32 p0, p1; 7611 7612 /* i = 0, j = 0 */ 7613 p0 = float32_mul(n00, m00, status); 7614 p1 = float32_mul(n01, m01, status); 7615 d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status); 7616 7617 /* i = 0, j = 1 */ 7618 p0 = float32_mul(n00, m10, status); 7619 p1 = float32_mul(n01, m11, status); 7620 d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status); 7621 7622 /* i = 1, j = 0 */ 7623 p0 = float32_mul(n10, m00, status); 7624 p1 = float32_mul(n11, m01, status); 7625 d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status); 7626 7627 /* i = 1, j = 1 */ 7628 p0 = float32_mul(n10, m10, status); 7629 p1 = float32_mul(n11, m11, status); 7630 d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status); 7631 } 7632 } 7633 7634 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va, 7635 float_status *status, uint32_t desc) 7636 { 7637 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4); 7638 7639 for (s = 0; s < opr_sz; ++s) { 7640 float64 *n = vn + s * sizeof(float64) * 4; 7641 float64 *m = vm + s * sizeof(float64) * 4; 7642 float64 *a = va + s * sizeof(float64) * 4; 7643 float64 *d = vd + s * sizeof(float64) * 4; 7644 float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3]; 7645 float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3]; 7646 float64 p0, p1; 7647 7648 /* i = 0, j = 0 */ 7649 p0 = float64_mul(n00, m00, status); 7650 p1 = float64_mul(n01, m01, status); 7651 d[0] = float64_add(a[0], float64_add(p0, p1, status), status); 7652 7653 /* i = 0, j = 1 */ 7654 p0 = float64_mul(n00, m10, status); 7655 p1 = float64_mul(n01, m11, status); 7656 d[1] = float64_add(a[1], float64_add(p0, p1, status), status); 7657 7658 /* i = 1, j = 0 */ 7659 p0 = float64_mul(n10, m00, status); 7660 p1 = float64_mul(n11, m01, status); 7661 d[2] = float64_add(a[2], float64_add(p0, p1, status), status); 7662 7663 /* i = 1, j = 1 */ 7664 p0 = float64_mul(n10, m10, status); 7665 p1 = float64_mul(n11, m11, status); 7666 d[3] = float64_add(a[3], float64_add(p0, p1, status), status); 7667 } 7668 } 7669 7670 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 7671 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 7672 float_status *status, uint32_t desc) \ 7673 { \ 7674 intptr_t i = simd_oprsz(desc); \ 7675 uint64_t *g = vg; \ 7676 do { \ 7677 uint64_t pg = g[(i - 1) >> 6]; \ 7678 do { \ 7679 i -= sizeof(TYPEW); \ 7680 if (likely((pg >> (i & 63)) & 1)) { \ 7681 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 7682 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \ 7683 } \ 7684 } while (i & 63); \ 7685 } while (i != 0); \ 7686 } 7687 7688 DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16) 7689 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16) 7690 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32) 7691 7692 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 
7693 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 7694 float_status *status, uint32_t desc) \ 7695 { \ 7696 intptr_t i = simd_oprsz(desc); \ 7697 uint64_t *g = vg; \ 7698 do { \ 7699 uint64_t pg = g[(i - 1) >> 6]; \ 7700 do { \ 7701 i -= sizeof(TYPEW); \ 7702 if (likely((pg >> (i & 63)) & 1)) { \ 7703 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \ 7704 *(TYPEW *)(vd + HW(i)) = OP(nn, status); \ 7705 } \ 7706 } while (i & 63); \ 7707 } while (i != 0); \ 7708 } 7709 7710 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32) 7711 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64) 7712 7713 #undef DO_FCVTLT 7714 #undef DO_FCVTNT 7715
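/*
 * Illustrative reference for the zero-in-word trick used by do_match2()
 * and do_histseg_cnt() above: an element lane of cmp0/cmp1 becomes zero
 * exactly where the corresponding lane of m0/m1 equals n, and the bit
 * hacks turn those zero lanes into 0x80 markers that can be tested or
 * counted.  The naive loop below is a sketch only (not built; the helper
 * name is an invention for this sketch) and computes the same per-byte
 * count as do_histseg_cnt().
 */
#if 0
static uint64_t do_histseg_cnt_ref(uint8_t n, uint64_t m0, uint64_t m1)
{
    uint64_t count = 0;
    int i;

    for (i = 0; i < 8; ++i) {
        count += (uint8_t)(m0 >> (i * 8)) == n;
        count += (uint8_t)(m1 >> (i * 8)) == n;
    }
    return count;
}
#endif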