1 /* 2 * ARM SVE Operations 3 * 4 * Copyright (c) 2018 Linaro, Ltd. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "cpu.h" 22 #include "internals.h" 23 #include "exec/exec-all.h" 24 #include "exec/page-protection.h" 25 #include "exec/helper-proto.h" 26 #include "tcg/tcg-gvec-desc.h" 27 #include "fpu/softfloat.h" 28 #include "tcg/tcg.h" 29 #include "vec_internal.h" 30 #include "sve_ldst_internal.h" 31 #include "hw/core/tcg-cpu-ops.h" 32 33 34 /* Return a value for NZCV as per the ARM PredTest pseudofunction. 35 * 36 * The return value has bit 31 set if N is set, bit 1 set if Z is clear, 37 * and bit 0 set if C is set. Compare the definitions of these variables 38 * within CPUARMState. 39 */ 40 41 /* For no G bits set, NZCV = C. */ 42 #define PREDTEST_INIT 1 43 44 /* This is an iterative function, called for each Pd and Pg word 45 * moving forward. 46 */ 47 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags) 48 { 49 if (likely(g)) { 50 /* Compute N from first D & G. 51 Use bit 2 to signal first G bit seen. */ 52 if (!(flags & 4)) { 53 flags |= ((d & (g & -g)) != 0) << 31; 54 flags |= 4; 55 } 56 57 /* Accumulate Z from each D & G. */ 58 flags |= ((d & g) != 0) << 1; 59 60 /* Compute C from last !(D & G). Replace previous. */ 61 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0); 62 } 63 return flags; 64 } 65 66 /* This is an iterative function, called for each Pd and Pg word 67 * moving backward. 68 */ 69 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags) 70 { 71 if (likely(g)) { 72 /* Compute C from first (i.e last) !(D & G). 73 Use bit 2 to signal first G bit seen. */ 74 if (!(flags & 4)) { 75 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */ 76 flags |= (d & pow2floor(g)) == 0; 77 } 78 79 /* Accumulate Z from each D & G. */ 80 flags |= ((d & g) != 0) << 1; 81 82 /* Compute N from last (i.e first) D & G. Replace previous. */ 83 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0); 84 } 85 return flags; 86 } 87 88 /* The same for a single word predicate. */ 89 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g) 90 { 91 return iter_predtest_fwd(d, g, PREDTEST_INIT); 92 } 93 94 /* The same for a multi-word predicate. */ 95 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words) 96 { 97 uint32_t flags = PREDTEST_INIT; 98 uint64_t *d = vd, *g = vg; 99 uintptr_t i = 0; 100 101 do { 102 flags = iter_predtest_fwd(d[i], g[i], flags); 103 } while (++i < words); 104 105 return flags; 106 } 107 108 /* Similarly for single word elements. 
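Only bits 0 and 4 of the predicate byte matter here: e.g. 0x10 selects just the upper 32-bit element of the 8-byte group, giving the mask 0xffffffff00000000.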
*/ 109 static inline uint64_t expand_pred_s(uint8_t byte) 110 { 111 static const uint64_t word[] = { 112 [0x01] = 0x00000000ffffffffull, 113 [0x10] = 0xffffffff00000000ull, 114 [0x11] = 0xffffffffffffffffull, 115 }; 116 return word[byte & 0x11]; 117 } 118 119 #define LOGICAL_PPPP(NAME, FUNC) \ 120 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 121 { \ 122 uintptr_t opr_sz = simd_oprsz(desc); \ 123 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \ 124 uintptr_t i; \ 125 for (i = 0; i < opr_sz / 8; ++i) { \ 126 d[i] = FUNC(n[i], m[i], g[i]); \ 127 } \ 128 } 129 130 #define DO_AND(N, M, G) (((N) & (M)) & (G)) 131 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G)) 132 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G)) 133 #define DO_ORR(N, M, G) (((N) | (M)) & (G)) 134 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G)) 135 #define DO_NOR(N, M, G) (~((N) | (M)) & (G)) 136 #define DO_NAND(N, M, G) (~((N) & (M)) & (G)) 137 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G))) 138 139 LOGICAL_PPPP(sve_and_pppp, DO_AND) 140 LOGICAL_PPPP(sve_bic_pppp, DO_BIC) 141 LOGICAL_PPPP(sve_eor_pppp, DO_EOR) 142 LOGICAL_PPPP(sve_sel_pppp, DO_SEL) 143 LOGICAL_PPPP(sve_orr_pppp, DO_ORR) 144 LOGICAL_PPPP(sve_orn_pppp, DO_ORN) 145 LOGICAL_PPPP(sve_nor_pppp, DO_NOR) 146 LOGICAL_PPPP(sve_nand_pppp, DO_NAND) 147 148 #undef DO_AND 149 #undef DO_BIC 150 #undef DO_EOR 151 #undef DO_ORR 152 #undef DO_ORN 153 #undef DO_NOR 154 #undef DO_NAND 155 #undef DO_SEL 156 #undef LOGICAL_PPPP 157 158 /* Fully general three-operand expander, controlled by a predicate. 159 * This is complicated by the host-endian storage of the register file. 160 */ 161 /* ??? I don't expect the compiler could ever vectorize this itself. 162 * With some tables we can convert bit masks to byte masks, and with 163 * extra care wrt byte/word ordering we could use gcc generic vectors 164 * and do 16 bytes at a time. 165 */ 166 #define DO_ZPZZ(NAME, TYPE, H, OP) \ 167 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 168 { \ 169 intptr_t i, opr_sz = simd_oprsz(desc); \ 170 for (i = 0; i < opr_sz; ) { \ 171 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 172 do { \ 173 if (pg & 1) { \ 174 TYPE nn = *(TYPE *)(vn + H(i)); \ 175 TYPE mm = *(TYPE *)(vm + H(i)); \ 176 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 177 } \ 178 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 179 } while (i & 15); \ 180 } \ 181 } 182 183 /* Similarly, specialized for 64-bit operands. */ 184 #define DO_ZPZZ_D(NAME, TYPE, OP) \ 185 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 186 { \ 187 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 188 TYPE *d = vd, *n = vn, *m = vm; \ 189 uint8_t *pg = vg; \ 190 for (i = 0; i < opr_sz; i += 1) { \ 191 if (pg[H1(i)] & 1) { \ 192 TYPE nn = n[i], mm = m[i]; \ 193 d[i] = OP(nn, mm); \ 194 } \ 195 } \ 196 } 197 198 #define DO_AND(N, M) (N & M) 199 #define DO_EOR(N, M) (N ^ M) 200 #define DO_ORR(N, M) (N | M) 201 #define DO_BIC(N, M) (N & ~M) 202 #define DO_ADD(N, M) (N + M) 203 #define DO_SUB(N, M) (N - M) 204 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) 205 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) 206 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N)) 207 #define DO_MUL(N, M) (N * M) 208 209 210 /* 211 * We must avoid the C undefined behaviour cases: division by 212 * zero and signed division of INT_MIN by -1. Both of these 213 * have architecturally defined required results for Arm. 
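* (For Arm, division by zero returns 0, and the one overflowing case, MinInt / -1, returns MinInt.)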
214 * We special case all signed divisions by -1 to avoid having 215 * to deduce the minimum integer for the type involved. 216 */ 217 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M) 218 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M) 219 220 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND) 221 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND) 222 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND) 223 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND) 224 225 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR) 226 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR) 227 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR) 228 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR) 229 230 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR) 231 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR) 232 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR) 233 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR) 234 235 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC) 236 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC) 237 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC) 238 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC) 239 240 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) 241 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD) 242 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD) 243 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD) 244 245 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB) 246 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB) 247 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB) 248 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB) 249 250 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX) 251 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX) 252 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX) 253 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX) 254 255 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX) 256 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX) 257 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX) 258 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX) 259 260 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN) 261 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN) 262 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN) 263 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN) 264 265 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN) 266 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN) 267 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN) 268 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN) 269 270 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD) 271 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD) 272 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD) 273 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD) 274 275 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD) 276 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD) 277 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD) 278 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD) 279 280 /* Because the computation type is at least twice as large as required, 281 these work for both signed and unsigned source types. 
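   For example, at byte size both extremes fit easily in int32_t: 0xff * 0xff = 0xfe01 unsigned and -128 * -128 = 0x4000 signed, and (n * m) >> 8 yields the correct high byte in either case.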
*/ 282 static inline uint8_t do_mulh_b(int32_t n, int32_t m) 283 { 284 return (n * m) >> 8; 285 } 286 287 static inline uint16_t do_mulh_h(int32_t n, int32_t m) 288 { 289 return (n * m) >> 16; 290 } 291 292 static inline uint32_t do_mulh_s(int64_t n, int64_t m) 293 { 294 return (n * m) >> 32; 295 } 296 297 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m) 298 { 299 uint64_t lo, hi; 300 muls64(&lo, &hi, n, m); 301 return hi; 302 } 303 304 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m) 305 { 306 uint64_t lo, hi; 307 mulu64(&lo, &hi, n, m); 308 return hi; 309 } 310 311 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL) 312 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL) 313 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL) 314 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL) 315 316 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b) 317 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h) 318 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s) 319 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d) 320 321 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b) 322 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h) 323 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s) 324 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d) 325 326 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV) 327 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV) 328 329 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV) 330 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV) 331 332 /* Note that all bits of the shift are significant 333 and not modulo the element size. */ 334 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1)) 335 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0) 336 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0) 337 338 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR) 339 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR) 340 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL) 341 342 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR) 343 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR) 344 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL) 345 346 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR) 347 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR) 348 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL) 349 350 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR) 351 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR) 352 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL) 353 354 static inline uint16_t do_sadalp_h(int16_t n, int16_t m) 355 { 356 int8_t n1 = n, n2 = n >> 8; 357 return m + n1 + n2; 358 } 359 360 static inline uint32_t do_sadalp_s(int32_t n, int32_t m) 361 { 362 int16_t n1 = n, n2 = n >> 16; 363 return m + n1 + n2; 364 } 365 366 static inline uint64_t do_sadalp_d(int64_t n, int64_t m) 367 { 368 int32_t n1 = n, n2 = n >> 32; 369 return m + n1 + n2; 370 } 371 372 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h) 373 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s) 374 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d) 375 376 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m) 377 { 378 uint8_t n1 = n, n2 = n >> 8; 379 return m + n1 + n2; 380 } 381 382 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m) 383 { 384 uint16_t n1 = n, n2 = n >> 16; 385 return m + n1 + n2; 386 } 387 388 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m) 389 { 390 uint32_t n1 = n, n2 = n >> 32; 391 return m + n1 + n2; 392 } 393 394 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h) 395 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s) 396 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d) 397 398 #define do_srshl_b(n, m) 
do_sqrshl_bhs(n, m, 8, true, NULL) 399 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL) 400 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL) 401 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL) 402 403 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b) 404 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h) 405 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s) 406 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d) 407 408 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL) 409 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL) 410 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL) 411 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL) 412 413 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b) 414 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h) 415 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s) 416 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d) 417 418 /* 419 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set. 420 * We pass in a pointer to a dummy saturation field to trigger 421 * the saturating arithmetic but discard the information about 422 * whether it has occurred. 423 */ 424 #define do_sqshl_b(n, m) \ 425 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); }) 426 #define do_sqshl_h(n, m) \ 427 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); }) 428 #define do_sqshl_s(n, m) \ 429 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); }) 430 #define do_sqshl_d(n, m) \ 431 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); }) 432 433 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b) 434 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h) 435 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s) 436 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d) 437 438 #define do_uqshl_b(n, m) \ 439 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 440 #define do_uqshl_h(n, m) \ 441 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 442 #define do_uqshl_s(n, m) \ 443 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); }) 444 #define do_uqshl_d(n, m) \ 445 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); }) 446 447 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b) 448 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h) 449 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s) 450 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d) 451 452 #define do_sqrshl_b(n, m) \ 453 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); }) 454 #define do_sqrshl_h(n, m) \ 455 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); }) 456 #define do_sqrshl_s(n, m) \ 457 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); }) 458 #define do_sqrshl_d(n, m) \ 459 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); }) 460 461 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b) 462 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h) 463 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s) 464 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d) 465 466 #undef do_sqrshl_d 467 468 #define do_uqrshl_b(n, m) \ 469 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); }) 470 #define do_uqrshl_h(n, m) \ 471 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); }) 472 #define do_uqrshl_s(n, m) \ 473 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); }) 474 #define do_uqrshl_d(n, m) \ 475 ({ uint32_t discard; do_uqrshl_d(n, 
m, true, &discard); }) 476 477 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b) 478 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h) 479 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s) 480 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d) 481 482 #undef do_uqrshl_d 483 484 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1) 485 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1)) 486 487 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS) 488 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS) 489 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS) 490 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D) 491 492 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS) 493 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS) 494 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS) 495 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D) 496 497 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1) 498 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1)) 499 500 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS) 501 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS) 502 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS) 503 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D) 504 505 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS) 506 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS) 507 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS) 508 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D) 509 510 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1) 511 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1)) 512 513 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS) 514 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS) 515 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS) 516 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D) 517 518 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS) 519 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) 520 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) 521 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) 522 523 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max) 524 { 525 return val >= max ? max : val <= min ? min : val; 526 } 527 528 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX) 529 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX) 530 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX) 531 532 static inline int64_t do_sqadd_d(int64_t n, int64_t m) 533 { 534 int64_t r = n + m; 535 if (((r ^ n) & ~(n ^ m)) < 0) { 536 /* Signed overflow. */ 537 return r < 0 ? INT64_MAX : INT64_MIN; 538 } 539 return r; 540 } 541 542 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B) 543 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) 544 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) 545 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) 546 547 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX) 548 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX) 549 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX) 550 551 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) 552 { 553 uint64_t r = n + m; 554 return r < n ? 
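/* r < n here means the 64-bit addition wrapped around */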
UINT64_MAX : r; 555 } 556 557 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B) 558 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H) 559 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S) 560 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d) 561 562 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX) 563 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX) 564 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX) 565 566 static inline int64_t do_sqsub_d(int64_t n, int64_t m) 567 { 568 int64_t r = n - m; 569 if (((r ^ n) & (n ^ m)) < 0) { 570 /* Signed overflow. */ 571 return r < 0 ? INT64_MAX : INT64_MIN; 572 } 573 return r; 574 } 575 576 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B) 577 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H) 578 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S) 579 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d) 580 581 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX) 582 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX) 583 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX) 584 585 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m) 586 { 587 return n > m ? n - m : 0; 588 } 589 590 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B) 591 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H) 592 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S) 593 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d) 594 595 #define DO_SUQADD_B(n, m) \ 596 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX) 597 #define DO_SUQADD_H(n, m) \ 598 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX) 599 #define DO_SUQADD_S(n, m) \ 600 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX) 601 602 static inline int64_t do_suqadd_d(int64_t n, uint64_t m) 603 { 604 uint64_t r = n + m; 605 606 if (n < 0) { 607 /* Note that m - abs(n) cannot underflow. */ 608 if (r > INT64_MAX) { 609 /* Result is either very large positive or negative. */ 610 if (m > -n) { 611 /* m > abs(n), so r is a very large positive. */ 612 return INT64_MAX; 613 } 614 /* Result is negative. */ 615 } 616 } else { 617 /* Both inputs are positive: check for overflow. */ 618 if (r < m || r > INT64_MAX) { 619 return INT64_MAX; 620 } 621 } 622 return r; 623 } 624 625 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B) 626 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H) 627 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S) 628 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d) 629 630 #define DO_USQADD_B(n, m) \ 631 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX) 632 #define DO_USQADD_H(n, m) \ 633 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX) 634 #define DO_USQADD_S(n, m) \ 635 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX) 636 637 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m) 638 { 639 uint64_t r = n + m; 640 641 if (m < 0) { 642 return n < -m ? 0 : r; 643 } 644 return r < n ? UINT64_MAX : r; 645 } 646 647 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B) 648 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H) 649 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S) 650 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d) 651 652 #undef DO_ZPZZ 653 #undef DO_ZPZZ_D 654 655 /* 656 * Three-operand expander, operating on element pairs. 657 * If the slot I is even, the elements come from VN {I, I+1}. 658 * If the slot I is odd, the elements come from VM {I-1, I}.
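* For ADDP, for example: D[0] = N[0] + N[1], D[1] = M[0] + M[1], D[2] = N[2] + N[3], D[3] = M[2] + M[3], and so on.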
659 * Load all of the input elements in each pair before overwriting output. 660 */ 661 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \ 662 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 663 { \ 664 intptr_t i, opr_sz = simd_oprsz(desc); \ 665 for (i = 0; i < opr_sz; ) { \ 666 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 667 do { \ 668 TYPE n0 = *(TYPE *)(vn + H(i)); \ 669 TYPE m0 = *(TYPE *)(vm + H(i)); \ 670 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 671 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 672 if (pg & 1) { \ 673 *(TYPE *)(vd + H(i)) = OP(n0, n1); \ 674 } \ 675 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 676 if (pg & 1) { \ 677 *(TYPE *)(vd + H(i)) = OP(m0, m1); \ 678 } \ 679 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 680 } while (i & 15); \ 681 } \ 682 } 683 684 /* Similarly, specialized for 64-bit operands. */ 685 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \ 686 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 687 { \ 688 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 689 TYPE *d = vd, *n = vn, *m = vm; \ 690 uint8_t *pg = vg; \ 691 for (i = 0; i < opr_sz; i += 2) { \ 692 TYPE n0 = n[i], n1 = n[i + 1]; \ 693 TYPE m0 = m[i], m1 = m[i + 1]; \ 694 if (pg[H1(i)] & 1) { \ 695 d[i] = OP(n0, n1); \ 696 } \ 697 if (pg[H1(i + 1)] & 1) { \ 698 d[i + 1] = OP(m0, m1); \ 699 } \ 700 } \ 701 } 702 703 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD) 704 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD) 705 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD) 706 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD) 707 708 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX) 709 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX) 710 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX) 711 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX) 712 713 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN) 714 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN) 715 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN) 716 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN) 717 718 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX) 719 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX) 720 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX) 721 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX) 722 723 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN) 724 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN) 725 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN) 726 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN) 727 728 #undef DO_ZPZZ_PAIR 729 #undef DO_ZPZZ_PAIR_D 730 731 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \ 732 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 733 float_status *status, uint32_t desc) \ 734 { \ 735 intptr_t i, opr_sz = simd_oprsz(desc); \ 736 for (i = 0; i < opr_sz; ) { \ 737 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 738 do { \ 739 TYPE n0 = *(TYPE *)(vn + H(i)); \ 740 TYPE m0 = *(TYPE *)(vm + H(i)); \ 741 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 742 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 743 if (pg & 1) { \ 744 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \ 745 } \ 746 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 747 if (pg & 1) { \ 748 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \ 749 } \ 750 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 751 } while (i & 15); \ 752 } \ 753 } 754 755 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add) 756 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add) 757 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, 
float64, H1_8, float64_add) 758 759 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum) 760 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum) 761 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum) 762 763 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum) 764 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum) 765 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum) 766 767 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max) 768 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max) 769 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max) 770 771 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min) 772 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min) 773 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min) 774 775 #undef DO_ZPZZ_PAIR_FP 776 777 /* Three-operand expander, controlled by a predicate, in which the 778 * third operand is "wide". That is, for D = N op M, the same 64-bit 779 * value of M is used with all of the narrower values of N. 780 */ 781 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \ 782 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 783 { \ 784 intptr_t i, opr_sz = simd_oprsz(desc); \ 785 for (i = 0; i < opr_sz; ) { \ 786 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \ 787 TYPEW mm = *(TYPEW *)(vm + i); \ 788 do { \ 789 if (pg & 1) { \ 790 TYPE nn = *(TYPE *)(vn + H(i)); \ 791 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 792 } \ 793 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 794 } while (i & 7); \ 795 } \ 796 } 797 798 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR) 799 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR) 800 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL) 801 802 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR) 803 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 804 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 805 806 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR) 807 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 808 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 809 810 #undef DO_ZPZW 811 812 /* Fully general two-operand expander, controlled by a predicate. 813 */ 814 #define DO_ZPZ(NAME, TYPE, H, OP) \ 815 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 816 { \ 817 intptr_t i, opr_sz = simd_oprsz(desc); \ 818 for (i = 0; i < opr_sz; ) { \ 819 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 820 do { \ 821 if (pg & 1) { \ 822 TYPE nn = *(TYPE *)(vn + H(i)); \ 823 *(TYPE *)(vd + H(i)) = OP(nn); \ 824 } \ 825 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 826 } while (i & 15); \ 827 } \ 828 } 829 830 /* Similarly, specialized for 64-bit operands. 
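No H() byte-swizzle is needed for the data here, since whole 64-bit elements are loaded and stored, and each element is governed by the low bit of its own predicate byte.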
*/ 831 #define DO_ZPZ_D(NAME, TYPE, OP) \ 832 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 833 { \ 834 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 835 TYPE *d = vd, *n = vn; \ 836 uint8_t *pg = vg; \ 837 for (i = 0; i < opr_sz; i += 1) { \ 838 if (pg[H1(i)] & 1) { \ 839 TYPE nn = n[i]; \ 840 d[i] = OP(nn); \ 841 } \ 842 } \ 843 } 844 845 #define DO_CLS_B(N) (clrsb32(N) - 24) 846 #define DO_CLS_H(N) (clrsb32(N) - 16) 847 848 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B) 849 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H) 850 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32) 851 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64) 852 853 #define DO_CLZ_B(N) (clz32(N) - 24) 854 #define DO_CLZ_H(N) (clz32(N) - 16) 855 856 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B) 857 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H) 858 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32) 859 DO_ZPZ_D(sve_clz_d, uint64_t, clz64) 860 861 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8) 862 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16) 863 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32) 864 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64) 865 866 #define DO_CNOT(N) (N == 0) 867 868 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT) 869 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT) 870 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT) 871 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT) 872 873 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1)) 874 875 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS) 876 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS) 877 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS) 878 879 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1)) 880 881 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG) 882 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG) 883 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG) 884 885 #define DO_NOT(N) (~N) 886 887 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT) 888 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT) 889 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT) 890 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT) 891 892 #define DO_SXTB(N) ((int8_t)N) 893 #define DO_SXTH(N) ((int16_t)N) 894 #define DO_SXTS(N) ((int32_t)N) 895 #define DO_UXTB(N) ((uint8_t)N) 896 #define DO_UXTH(N) ((uint16_t)N) 897 #define DO_UXTS(N) ((uint32_t)N) 898 899 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB) 900 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB) 901 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH) 902 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB) 903 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH) 904 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS) 905 906 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB) 907 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB) 908 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH) 909 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB) 910 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH) 911 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS) 912 913 #define DO_ABS(N) (N < 0 ? 
-N : N) 914 915 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS) 916 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS) 917 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS) 918 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS) 919 920 #define DO_NEG(N) (-N) 921 922 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG) 923 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) 924 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) 925 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) 926 927 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) 928 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) 929 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) 930 931 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) 932 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) 933 934 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) 935 936 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc) 937 { 938 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 939 uint64_t *d = vd, *n = vn; 940 uint8_t *pg = vg; 941 942 for (i = 0; i < opr_sz; i += 2) { 943 if (pg[H1(i)] & 1) { 944 uint64_t n0 = n[i + 0]; 945 uint64_t n1 = n[i + 1]; 946 d[i + 0] = n1; 947 d[i + 1] = n0; 948 } 949 } 950 } 951 952 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) 953 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) 954 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) 955 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) 956 957 #define DO_SQABS(X) \ 958 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 959 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; }) 960 961 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS) 962 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS) 963 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS) 964 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS) 965 966 #define DO_SQNEG(X) \ 967 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 968 x_ == min_ ? -min_ - 1 : -x_; }) 969 970 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG) 971 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG) 972 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG) 973 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG) 974 975 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32) 976 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32) 977 978 /* Three-operand expander, unpredicated, in which the third operand is "wide". 979 */ 980 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ 981 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 982 { \ 983 intptr_t i, opr_sz = simd_oprsz(desc); \ 984 for (i = 0; i < opr_sz; ) { \ 985 TYPEW mm = *(TYPEW *)(vm + i); \ 986 do { \ 987 TYPE nn = *(TYPE *)(vn + H(i)); \ 988 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 989 i += sizeof(TYPE); \ 990 } while (i & 7); \ 991 } \ 992 } 993 994 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR) 995 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR) 996 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL) 997 998 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR) 999 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 1000 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 1001 1002 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR) 1003 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 1004 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 1005 1006 #undef DO_ZZW 1007 1008 #undef DO_CLS_B 1009 #undef DO_CLS_H 1010 #undef DO_CLZ_B 1011 #undef DO_CLZ_H 1012 #undef DO_CNOT 1013 #undef DO_FABS 1014 #undef DO_FNEG 1015 #undef DO_ABS 1016 #undef DO_NEG 1017 #undef DO_ZPZ 1018 #undef DO_ZPZ_D 1019 1020 /* 1021 * Three-operand expander, unpredicated, in which the two inputs are 1022 * selected from the top or bottom half of the wide column. 
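* E.g. SADDLB (sel1 = sel2 = 0) computes D.h[k] = sext(N.b[2k]) + sext(M.b[2k]); the -T forms take the odd-numbered narrow elements instead.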
1023 */ 1024 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1025 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1026 { \ 1027 intptr_t i, opr_sz = simd_oprsz(desc); \ 1028 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1029 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1030 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1031 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1032 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1033 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1034 } \ 1035 } 1036 1037 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1038 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1039 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1040 1041 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1042 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1043 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1044 1045 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1046 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1047 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1048 1049 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1050 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1051 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1052 1053 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1054 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1055 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1056 1057 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1058 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1059 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1060 1061 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1062 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1063 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1064 1065 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1066 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1067 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1068 1069 /* Note that the multiply cannot overflow, but the doubling can. 
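E.g. do_sqdmull_h sees sign-extended bytes, so the largest magnitude product is (-128) * (-128) = 16384, which fits in int16_t; doubling it to 32768 does not, and saturates to INT16_MAX.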
*/ 1070 static inline int16_t do_sqdmull_h(int16_t n, int16_t m) 1071 { 1072 int16_t val = n * m; 1073 return DO_SQADD_H(val, val); 1074 } 1075 1076 static inline int32_t do_sqdmull_s(int32_t n, int32_t m) 1077 { 1078 int32_t val = n * m; 1079 return DO_SQADD_S(val, val); 1080 } 1081 1082 static inline int64_t do_sqdmull_d(int64_t n, int64_t m) 1083 { 1084 int64_t val = n * m; 1085 return do_sqadd_d(val, val); 1086 } 1087 1088 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h) 1089 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1090 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1091 1092 #undef DO_ZZZ_TB 1093 1094 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1095 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1096 { \ 1097 intptr_t i, opr_sz = simd_oprsz(desc); \ 1098 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1099 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1100 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 1101 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1102 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1103 } \ 1104 } 1105 1106 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1107 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1108 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1109 1110 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1111 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1112 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1113 1114 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1115 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1116 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1117 1118 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1119 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1120 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1121 1122 #undef DO_ZZZ_WTB 1123 1124 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \ 1125 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1126 { \ 1127 intptr_t i, opr_sz = simd_oprsz(desc); \ 1128 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \ 1129 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \ 1130 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1131 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \ 1132 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \ 1133 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \ 1134 } \ 1135 } 1136 1137 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR) 1138 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR) 1139 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR) 1140 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR) 1141 1142 #undef DO_ZZZ_NTB 1143 1144 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1145 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1146 { \ 1147 intptr_t i, opr_sz = simd_oprsz(desc); \ 1148 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \ 1149 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1150 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1151 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \ 1152 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1153 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \ 1154 } \ 1155 } 1156 1157 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1158 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1159 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1160 1161 
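/*
 * Illustration only: hand-expanding the instantiation
 * DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) above
 * gives roughly the helper below.  The name ref_sabal_h is ours,
 * purely for exposition; it is not part of QEMU's helper interface.
 */
static inline void ref_sabal_h(void *vd, void *vn, void *vm,
                               void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    /* Bit 0 of the data field selects the bottom (0) or top (1) inputs. */
    intptr_t sel1 = simd_data(desc) * sizeof(int8_t);

    for (i = 0; i < opr_sz; i += sizeof(int16_t)) {
        /* Widen one int8_t from each source, selected by sel1. */
        int16_t nn = *(int8_t *)(vn + H1(i + sel1));
        int16_t mm = *(int8_t *)(vm + H1(i + sel1));
        int16_t aa = *(int16_t *)(va + H1_2(i));
        /* Absolute difference, accumulated into the wide element. */
        *(int16_t *)(vd + H1_2(i)) = DO_ABD(nn, mm) + aa;
    }
}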
DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1162 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1163 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1164 1165 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1166 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1167 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1168 1169 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1170 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1171 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1172 1173 #define DO_NMUL(N, M) -(N * M) 1174 1175 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL) 1176 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL) 1177 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL) 1178 1179 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL) 1180 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL) 1181 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL) 1182 1183 #undef DO_ZZZW_ACC 1184 1185 #define DO_XTNB(NAME, TYPE, OP) \ 1186 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1187 { \ 1188 intptr_t i, opr_sz = simd_oprsz(desc); \ 1189 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1190 TYPE nn = *(TYPE *)(vn + i); \ 1191 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \ 1192 *(TYPE *)(vd + i) = nn; \ 1193 } \ 1194 } 1195 1196 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \ 1197 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1198 { \ 1199 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \ 1200 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1201 TYPE nn = *(TYPE *)(vn + i); \ 1202 *(TYPEN *)(vd + i + odd) = OP(nn); \ 1203 } \ 1204 } 1205 1206 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX) 1207 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX) 1208 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX) 1209 1210 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H) 1211 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S) 1212 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D) 1213 1214 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H) 1215 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S) 1216 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D) 1217 1218 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX) 1219 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX) 1220 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX) 1221 1222 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H) 1223 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S) 1224 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D) 1225 1226 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H) 1227 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S) 1228 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D) 1229 1230 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H) 1231 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S) 1232 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D) 1233 1234 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H) 1235 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S) 1236 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D) 1237 1238 #undef DO_XTNB 1239 #undef DO_XTNT 1240 1241 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1242 { 1243 intptr_t i, opr_sz = simd_oprsz(desc); 1244 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1)); 1245 uint32_t 
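/* 0 or 0xffffffff: an XOR mask that inverts N for the subtract (SBCLB/SBCLT) forms */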
inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1246 uint32_t *a = va, *n = vn; 1247 uint64_t *d = vd, *m = vm; 1248 1249 for (i = 0; i < opr_sz / 8; ++i) { 1250 uint32_t e1 = a[2 * i + H4(0)]; 1251 uint32_t e2 = n[2 * i + sel] ^ inv; 1252 uint64_t c = extract64(m[i], 32, 1); 1253 /* Compute and store the entire 33-bit result at once. */ 1254 d[i] = c + e1 + e2; 1255 } 1256 } 1257 1258 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1259 { 1260 intptr_t i, opr_sz = simd_oprsz(desc); 1261 int sel = extract32(desc, SIMD_DATA_SHIFT, 1); 1262 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1263 uint64_t *d = vd, *a = va, *n = vn, *m = vm; 1264 1265 for (i = 0; i < opr_sz / 8; i += 2) { 1266 Int128 e1 = int128_make64(a[i]); 1267 Int128 e2 = int128_make64(n[i + sel] ^ inv); 1268 Int128 c = int128_make64(m[i + 1] & 1); 1269 Int128 r = int128_add(int128_add(e1, e2), c); 1270 d[i + 0] = int128_getlo(r); 1271 d[i + 1] = int128_gethi(r); 1272 } 1273 } 1274 1275 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \ 1276 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1277 { \ 1278 intptr_t i, opr_sz = simd_oprsz(desc); \ 1279 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1280 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1281 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1282 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1283 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1284 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1285 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \ 1286 } \ 1287 } 1288 1289 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1, 1290 do_sqdmull_h, DO_SQADD_H) 1291 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1292 do_sqdmull_s, DO_SQADD_S) 1293 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1294 do_sqdmull_d, do_sqadd_d) 1295 1296 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1, 1297 do_sqdmull_h, DO_SQSUB_H) 1298 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1299 do_sqdmull_s, DO_SQSUB_S) 1300 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1301 do_sqdmull_d, do_sqsub_d) 1302 1303 #undef DO_SQDMLAL 1304 1305 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \ 1306 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1307 { \ 1308 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1309 int rot = simd_data(desc); \ 1310 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1311 bool sub_r = rot == 1 || rot == 2; \ 1312 bool sub_i = rot >= 2; \ 1313 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1314 for (i = 0; i < opr_sz; i += 2) { \ 1315 TYPE elt1_a = n[H(i + sel_a)]; \ 1316 TYPE elt2_a = m[H(i + sel_a)]; \ 1317 TYPE elt2_b = m[H(i + sel_b)]; \ 1318 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \ 1319 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \ 1320 } \ 1321 } 1322 1323 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? 
-1 : 1)) 1324 1325 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA) 1326 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA) 1327 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA) 1328 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA) 1329 1330 #define DO_SQRDMLAH_B(N, M, A, S) \ 1331 do_sqrdmlah_b(N, M, A, S, true) 1332 #define DO_SQRDMLAH_H(N, M, A, S) \ 1333 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); }) 1334 #define DO_SQRDMLAH_S(N, M, A, S) \ 1335 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); }) 1336 #define DO_SQRDMLAH_D(N, M, A, S) \ 1337 do_sqrdmlah_d(N, M, A, S, true) 1338 1339 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B) 1340 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H) 1341 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S) 1342 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D) 1343 1344 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \ 1345 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1346 { \ 1347 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1348 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \ 1349 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \ 1350 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1351 bool sub_r = rot == 1 || rot == 2; \ 1352 bool sub_i = rot >= 2; \ 1353 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1354 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \ 1355 TYPE elt2_a = m[H(i + idx + sel_a)]; \ 1356 TYPE elt2_b = m[H(i + idx + sel_b)]; \ 1357 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \ 1358 TYPE elt1_a = n[H(i + j + sel_a)]; \ 1359 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \ 1360 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \ 1361 } \ 1362 } \ 1363 } 1364 1365 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA) 1366 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA) 1367 1368 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1369 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1370 1371 #undef DO_CMLA 1372 #undef DO_CMLA_FUNC 1373 #undef DO_CMLA_IDX_FUNC 1374 #undef DO_SQRDMLAH_B 1375 #undef DO_SQRDMLAH_H 1376 #undef DO_SQRDMLAH_S 1377 #undef DO_SQRDMLAH_D 1378 1379 /* Note N and M are 4 elements bundled into one unit. */ 1380 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a, 1381 int sel_a, int sel_b, int sub_i) 1382 { 1383 for (int i = 0; i <= 1; i++) { 1384 int32_t elt1_r = (int8_t)(n >> (16 * i)); 1385 int32_t elt1_i = (int8_t)(n >> (16 * i + 8)); 1386 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a)); 1387 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b)); 1388 1389 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1390 } 1391 return a; 1392 } 1393 1394 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a, 1395 int sel_a, int sel_b, int sub_i) 1396 { 1397 for (int i = 0; i <= 1; i++) { 1398 int64_t elt1_r = (int16_t)(n >> (32 * i + 0)); 1399 int64_t elt1_i = (int16_t)(n >> (32 * i + 16)); 1400 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a)); 1401 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b)); 1402 1403 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1404 } 1405 return a; 1406 } 1407 1408 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm, 1409 void *va, uint32_t desc) 1410 { 1411 int opr_sz = simd_oprsz(desc); 1412 int rot = simd_data(desc); 1413 int sel_a = rot & 1; 1414 int sel_b = sel_a ^ 1; 1415 int sub_i = (rot == 0 || rot == 3 ? 
-1 : 1); 1416 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1417 1418 for (int e = 0; e < opr_sz / 4; e++) { 1419 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1420 } 1421 } 1422 1423 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm, 1424 void *va, uint32_t desc) 1425 { 1426 int opr_sz = simd_oprsz(desc); 1427 int rot = simd_data(desc); 1428 int sel_a = rot & 1; 1429 int sel_b = sel_a ^ 1; 1430 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1431 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1432 1433 for (int e = 0; e < opr_sz / 8; e++) { 1434 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1435 } 1436 } 1437 1438 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm, 1439 void *va, uint32_t desc) 1440 { 1441 int opr_sz = simd_oprsz(desc); 1442 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1443 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2)); 1444 int sel_a = rot & 1; 1445 int sel_b = sel_a ^ 1; 1446 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1447 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1448 1449 for (int seg = 0; seg < opr_sz / 4; seg += 4) { 1450 uint32_t seg_m = m[seg + idx]; 1451 for (int e = 0; e < 4; e++) { 1452 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e], 1453 sel_a, sel_b, sub_i); 1454 } 1455 } 1456 } 1457 1458 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm, 1459 void *va, uint32_t desc) 1460 { 1461 int seg, opr_sz = simd_oprsz(desc); 1462 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1463 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 1464 int sel_a = rot & 1; 1465 int sel_b = sel_a ^ 1; 1466 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1467 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1468 1469 for (seg = 0; seg < opr_sz / 8; seg += 2) { 1470 uint64_t seg_m = m[seg + idx]; 1471 for (int e = 0; e < 2; e++) { 1472 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e], 1473 sel_a, sel_b, sub_i); 1474 } 1475 } 1476 } 1477 1478 #define DO_ZZXZ(NAME, TYPE, H, OP) \ 1479 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1480 { \ 1481 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ 1482 intptr_t i, j, idx = simd_data(desc); \ 1483 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \ 1484 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1485 TYPE mm = m[i]; \ 1486 for (j = 0; j < segment; j++) { \ 1487 d[i + j] = OP(n[i + j], mm, a[i + j]); \ 1488 } \ 1489 } \ 1490 } 1491 1492 #define DO_SQRDMLAH_H(N, M, A) \ 1493 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); }) 1494 #define DO_SQRDMLAH_S(N, M, A) \ 1495 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); }) 1496 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true) 1497 1498 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1499 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1500 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D) 1501 1502 #define DO_SQRDMLSH_H(N, M, A) \ 1503 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); }) 1504 #define DO_SQRDMLSH_S(N, M, A) \ 1505 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); }) 1506 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true) 1507 1508 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H) 1509 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S) 1510 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D) 1511 1512 #undef DO_ZZXZ 1513 1514 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1515 void HELPER(NAME)(void *vd, void *vn, 
void *vm, void *va, uint32_t desc) \ 1516 { \ 1517 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1518 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1519 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1520 for (i = 0; i < oprsz; i += 16) { \ 1521 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1522 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1523 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1524 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \ 1525 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \ 1526 } \ 1527 } \ 1528 } 1529 1530 #define DO_MLA(N, M, A) (A + N * M) 1531 1532 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA) 1533 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA) 1534 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA) 1535 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA) 1536 1537 #define DO_MLS(N, M, A) (A - N * M) 1538 1539 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS) 1540 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS) 1541 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS) 1542 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS) 1543 1544 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M)) 1545 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M)) 1546 1547 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S) 1548 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D) 1549 1550 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M)) 1551 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M)) 1552 1553 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S) 1554 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D) 1555 1556 #undef DO_MLA 1557 #undef DO_MLS 1558 #undef DO_ZZXW 1559 1560 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1561 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1562 { \ 1563 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1564 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1565 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1566 for (i = 0; i < oprsz; i += 16) { \ 1567 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1568 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1569 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1570 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \ 1571 } \ 1572 } \ 1573 } 1574 1575 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1576 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1577 1578 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1579 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1580 1581 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1582 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1583 1584 #undef DO_ZZX 1585 1586 #define DO_BITPERM(NAME, TYPE, OP) \ 1587 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1588 { \ 1589 intptr_t i, opr_sz = simd_oprsz(desc); \ 1590 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1591 TYPE nn = *(TYPE *)(vn + i); \ 1592 TYPE mm = *(TYPE *)(vm + i); \ 1593 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \ 1594 } \ 1595 } 1596 1597 static uint64_t bitextract(uint64_t data, uint64_t mask, int n) 1598 { 1599 uint64_t res = 0; 1600 int db, rb = 0; 1601 1602 for (db = 0; db < n; ++db) { 1603 if ((mask >> db) & 1) { 1604 res |= ((data >> db) & 1) 
<< rb; 1605 ++rb; 1606 } 1607 } 1608 return res; 1609 } 1610 1611 DO_BITPERM(sve2_bext_b, uint8_t, bitextract) 1612 DO_BITPERM(sve2_bext_h, uint16_t, bitextract) 1613 DO_BITPERM(sve2_bext_s, uint32_t, bitextract) 1614 DO_BITPERM(sve2_bext_d, uint64_t, bitextract) 1615 1616 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) 1617 { 1618 uint64_t res = 0; 1619 int rb, db = 0; 1620 1621 for (rb = 0; rb < n; ++rb) { 1622 if ((mask >> rb) & 1) { 1623 res |= ((data >> db) & 1) << rb; 1624 ++db; 1625 } 1626 } 1627 return res; 1628 } 1629 1630 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) 1631 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) 1632 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) 1633 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit) 1634 1635 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) 1636 { 1637 uint64_t resm = 0, resu = 0; 1638 int db, rbm = 0, rbu = 0; 1639 1640 for (db = 0; db < n; ++db) { 1641 uint64_t val = (data >> db) & 1; 1642 if ((mask >> db) & 1) { 1643 resm |= val << rbm++; 1644 } else { 1645 resu |= val << rbu++; 1646 } 1647 } 1648 1649 return resm | (resu << rbm); 1650 } 1651 1652 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) 1653 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) 1654 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) 1655 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) 1656 1657 #undef DO_BITPERM 1658 1659 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ 1660 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1661 { \ 1662 intptr_t i, opr_sz = simd_oprsz(desc); \ 1663 int sub_r = simd_data(desc); \ 1664 if (sub_r) { \ 1665 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1666 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1667 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1668 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1669 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1670 acc_r = ADD_OP(acc_r, el2_i); \ 1671 acc_i = SUB_OP(acc_i, el2_r); \ 1672 *(TYPE *)(vd + H(i)) = acc_r; \ 1673 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1674 } \ 1675 } else { \ 1676 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1677 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1678 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1679 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1680 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1681 acc_r = SUB_OP(acc_r, el2_i); \ 1682 acc_i = ADD_OP(acc_i, el2_r); \ 1683 *(TYPE *)(vd + H(i)) = acc_r; \ 1684 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1685 } \ 1686 } \ 1687 } 1688 1689 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) 1690 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) 1691 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) 1692 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) 1693 1694 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) 1695 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) 1696 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) 1697 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) 1698 1699 #undef DO_CADD 1700 1701 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ 1702 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1703 { \ 1704 intptr_t i, opr_sz = simd_oprsz(desc); \ 1705 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ 1706 int shift = simd_data(desc) >> 1; \ 1707 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1708 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ 1709 *(TYPEW *)(vd + HW(i)) = nn << shift; \ 1710 } \ 1711 } 1712 1713 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) 1714 DO_ZZI_SHLL(sve2_sshll_s, int32_t, 
int16_t, H1_4, H1_2) 1715 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) 1716 1717 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) 1718 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) 1719 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) 1720 1721 #undef DO_ZZI_SHLL 1722 1723 /* Two-operand reduction expander, controlled by a predicate. 1724 * The difference between TYPERED and TYPERET has to do with 1725 * sign-extension. E.g. for SMAX, TYPERED must be signed, 1726 * but TYPERET must be unsigned so that e.g. a 32-bit value 1727 * is not sign-extended to the ABI uint64_t return type. 1728 */ 1729 /* ??? If we were to vectorize this by hand the reduction ordering 1730 * would change. For integer operands, this is perfectly fine. 1731 */ 1732 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ 1733 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1734 { \ 1735 intptr_t i, opr_sz = simd_oprsz(desc); \ 1736 TYPERED ret = INIT; \ 1737 for (i = 0; i < opr_sz; ) { \ 1738 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 1739 do { \ 1740 if (pg & 1) { \ 1741 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ 1742 ret = OP(ret, nn); \ 1743 } \ 1744 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ 1745 } while (i & 15); \ 1746 } \ 1747 return (TYPERET)ret; \ 1748 } 1749 1750 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ 1751 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1752 { \ 1753 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 1754 TYPEE *n = vn; \ 1755 uint8_t *pg = vg; \ 1756 TYPER ret = INIT; \ 1757 for (i = 0; i < opr_sz; i += 1) { \ 1758 if (pg[H1(i)] & 1) { \ 1759 TYPEE nn = n[i]; \ 1760 ret = OP(ret, nn); \ 1761 } \ 1762 } \ 1763 return ret; \ 1764 } 1765 1766 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) 1767 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) 1768 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) 1769 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) 1770 1771 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) 1772 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) 1773 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) 1774 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) 1775 1776 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) 1777 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) 1778 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) 1779 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) 1780 1781 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1782 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1783 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1784 1785 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1786 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1787 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1788 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) 1789 1790 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) 1791 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) 1792 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) 1793 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) 1794 1795 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) 1796 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) 1797 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) 
1798 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) 1799 1800 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) 1801 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) 1802 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) 1803 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) 1804 1805 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) 1806 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) 1807 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) 1808 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) 1809 1810 #undef DO_VPZ 1811 #undef DO_VPZ_D 1812 1813 /* Two vector operand, one scalar operand, unpredicated. */ 1814 #define DO_ZZI(NAME, TYPE, OP) \ 1815 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ 1816 { \ 1817 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1818 TYPE s = s64, *d = vd, *n = vn; \ 1819 for (i = 0; i < opr_sz; ++i) { \ 1820 d[i] = OP(n[i], s); \ 1821 } \ 1822 } 1823 1824 #define DO_SUBR(X, Y) (Y - X) 1825 1826 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) 1827 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) 1828 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) 1829 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) 1830 1831 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) 1832 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) 1833 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) 1834 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) 1835 1836 DO_ZZI(sve_smini_b, int8_t, DO_MIN) 1837 DO_ZZI(sve_smini_h, int16_t, DO_MIN) 1838 DO_ZZI(sve_smini_s, int32_t, DO_MIN) 1839 DO_ZZI(sve_smini_d, int64_t, DO_MIN) 1840 1841 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) 1842 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) 1843 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) 1844 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) 1845 1846 DO_ZZI(sve_umini_b, uint8_t, DO_MIN) 1847 DO_ZZI(sve_umini_h, uint16_t, DO_MIN) 1848 DO_ZZI(sve_umini_s, uint32_t, DO_MIN) 1849 DO_ZZI(sve_umini_d, uint64_t, DO_MIN) 1850 1851 #undef DO_ZZI 1852 1853 #undef DO_AND 1854 #undef DO_ORR 1855 #undef DO_EOR 1856 #undef DO_BIC 1857 #undef DO_ADD 1858 #undef DO_SUB 1859 #undef DO_MAX 1860 #undef DO_MIN 1861 #undef DO_ABD 1862 #undef DO_MUL 1863 #undef DO_DIV 1864 #undef DO_ASR 1865 #undef DO_LSR 1866 #undef DO_LSL 1867 #undef DO_SUBR 1868 1869 /* Similar to the ARM LastActiveElement pseudocode function, except the 1870 result is multiplied by the element size. This includes the not found 1871 indication; e.g. not found for esz=3 is -8. */ 1872 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) 1873 { 1874 uint64_t mask = pred_esz_masks[esz]; 1875 intptr_t i = words; 1876 1877 do { 1878 uint64_t this_g = g[--i] & mask; 1879 if (this_g) { 1880 return i * 64 + (63 - clz64(this_g)); 1881 } 1882 } while (i > 0); 1883 return (intptr_t)-1 << esz; 1884 } 1885 1886 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) 1887 { 1888 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1889 uint32_t flags = PREDTEST_INIT; 1890 uint64_t *d = vd, *g = vg; 1891 intptr_t i = 0; 1892 1893 do { 1894 uint64_t this_d = d[i]; 1895 uint64_t this_g = g[i]; 1896 1897 if (this_g) { 1898 if (!(flags & 4)) { 1899 /* Set in D the first bit of G. 
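 * Worked example (illustrative values, not from the original source):
 * if this_g == 0x30, then this_g & -this_g == 0x10, so only bit 4,
 * the first active element, is forced on in D.  Later words are left
 * untouched because iter_predtest_fwd() sets bit 2 of FLAGS once a
 * nonzero G word has been seen.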
*/ 1900 this_d |= this_g & -this_g; 1901 d[i] = this_d; 1902 } 1903 flags = iter_predtest_fwd(this_d, this_g, flags); 1904 } 1905 } while (++i < words); 1906 1907 return flags; 1908 } 1909 1910 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) 1911 { 1912 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1913 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 1914 uint32_t flags = PREDTEST_INIT; 1915 uint64_t *d = vd, *g = vg, esz_mask; 1916 intptr_t i, next; 1917 1918 next = last_active_element(vd, words, esz) + (1 << esz); 1919 esz_mask = pred_esz_masks[esz]; 1920 1921 /* Similar to the pseudocode for pnext, but scaled by ESZ 1922 so that we find the correct bit. */ 1923 if (next < words * 64) { 1924 uint64_t mask = -1; 1925 1926 if (next & 63) { 1927 mask = ~((1ull << (next & 63)) - 1); 1928 next &= -64; 1929 } 1930 do { 1931 uint64_t this_g = g[next / 64] & esz_mask & mask; 1932 if (this_g != 0) { 1933 next = (next & -64) + ctz64(this_g); 1934 break; 1935 } 1936 next += 64; 1937 mask = -1; 1938 } while (next < words * 64); 1939 } 1940 1941 i = 0; 1942 do { 1943 uint64_t this_d = 0; 1944 if (i == next / 64) { 1945 this_d = 1ull << (next & 63); 1946 } 1947 d[i] = this_d; 1948 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); 1949 } while (++i < words); 1950 1951 return flags; 1952 } 1953 1954 /* 1955 * Copy Zn into Zd, and store zero into inactive elements. 1956 * If inv, store zeros into the active elements. 1957 */ 1958 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) 1959 { 1960 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1961 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1962 uint64_t *d = vd, *n = vn; 1963 uint8_t *pg = vg; 1964 1965 for (i = 0; i < opr_sz; i += 1) { 1966 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); 1967 } 1968 } 1969 1970 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) 1971 { 1972 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1973 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1974 uint64_t *d = vd, *n = vn; 1975 uint8_t *pg = vg; 1976 1977 for (i = 0; i < opr_sz; i += 1) { 1978 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); 1979 } 1980 } 1981 1982 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) 1983 { 1984 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1985 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1986 uint64_t *d = vd, *n = vn; 1987 uint8_t *pg = vg; 1988 1989 for (i = 0; i < opr_sz; i += 1) { 1990 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); 1991 } 1992 } 1993 1994 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) 1995 { 1996 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1997 uint64_t *d = vd, *n = vn; 1998 uint8_t *pg = vg; 1999 uint8_t inv = simd_data(desc); 2000 2001 for (i = 0; i < opr_sz; i += 1) { 2002 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); 2003 } 2004 } 2005 2006 /* Three-operand expander, immediate operand, controlled by a predicate. 2007 */ 2008 #define DO_ZPZI(NAME, TYPE, H, OP) \ 2009 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2010 { \ 2011 intptr_t i, opr_sz = simd_oprsz(desc); \ 2012 TYPE imm = simd_data(desc); \ 2013 for (i = 0; i < opr_sz; ) { \ 2014 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2015 do { \ 2016 if (pg & 1) { \ 2017 TYPE nn = *(TYPE *)(vn + H(i)); \ 2018 *(TYPE *)(vd + H(i)) = OP(nn, imm); \ 2019 } \ 2020 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2021 } while (i & 15); \ 2022 } \ 2023 } 2024 2025 /* Similarly, specialized for 64-bit operands. 
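 * For 64-bit elements each predicate byte governs exactly one element,
 * so testing the low bit of pg[H1(i)] is sufficient and no in-word
 * predicate shifting is needed.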
*/ 2026 #define DO_ZPZI_D(NAME, TYPE, OP) \ 2027 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2028 { \ 2029 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2030 TYPE *d = vd, *n = vn; \ 2031 TYPE imm = simd_data(desc); \ 2032 uint8_t *pg = vg; \ 2033 for (i = 0; i < opr_sz; i += 1) { \ 2034 if (pg[H1(i)] & 1) { \ 2035 TYPE nn = n[i]; \ 2036 d[i] = OP(nn, imm); \ 2037 } \ 2038 } \ 2039 } 2040 2041 #define DO_SHR(N, M) (N >> M) 2042 #define DO_SHL(N, M) (N << M) 2043 2044 /* Arithmetic shift right for division. This rounds negative numbers 2045 toward zero as per signed division. Therefore before shifting, 2046 when N is negative, add 2**M-1. */ 2047 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) 2048 2049 static inline uint64_t do_urshr(uint64_t x, unsigned sh) 2050 { 2051 if (likely(sh < 64)) { 2052 return (x >> sh) + ((x >> (sh - 1)) & 1); 2053 } else if (sh == 64) { 2054 return x >> 63; 2055 } else { 2056 return 0; 2057 } 2058 } 2059 2060 static inline int64_t do_srshr(int64_t x, unsigned sh) 2061 { 2062 if (likely(sh < 64)) { 2063 return (x >> sh) + ((x >> (sh - 1)) & 1); 2064 } else { 2065 /* Rounding the sign bit always produces 0. */ 2066 return 0; 2067 } 2068 } 2069 2070 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) 2071 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) 2072 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) 2073 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) 2074 2075 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) 2076 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) 2077 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) 2078 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) 2079 2080 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) 2081 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) 2082 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) 2083 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) 2084 2085 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) 2086 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) 2087 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) 2088 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) 2089 2090 /* SVE2 bitwise shift by immediate */ 2091 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) 2092 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) 2093 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) 2094 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) 2095 2096 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) 2097 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) 2098 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) 2099 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) 2100 2101 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) 2102 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) 2103 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) 2104 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) 2105 2106 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) 2107 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) 2108 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) 2109 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) 2110 2111 #define do_suqrshl_b(n, m) \ 2112 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 2113 #define do_suqrshl_h(n, m) \ 2114 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 2115 #define do_suqrshl_s(n, m) \ 2116 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) 2117 #define do_suqrshl_d(n, m) \ 2118 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) 2119 2120 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) 2121 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) 2122 
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) 2123 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) 2124 2125 #undef DO_ASRD 2126 #undef DO_ZPZI 2127 #undef DO_ZPZI_D 2128 2129 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ 2130 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2131 { \ 2132 intptr_t i, opr_sz = simd_oprsz(desc); \ 2133 int shift = simd_data(desc); \ 2134 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2135 TYPEW nn = *(TYPEW *)(vn + i); \ 2136 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ 2137 } \ 2138 } 2139 2140 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 2141 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2142 { \ 2143 intptr_t i, opr_sz = simd_oprsz(desc); \ 2144 int shift = simd_data(desc); \ 2145 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2146 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2147 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ 2148 } \ 2149 } 2150 2151 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) 2152 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) 2153 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) 2154 2155 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) 2156 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) 2157 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) 2158 2159 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) 2160 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) 2161 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) 2162 2163 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 2164 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) 2165 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) 2166 2167 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) 2168 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) 2169 #define DO_SQSHRUN_D(x, sh) \ 2170 do_sat_bhs((int64_t)(x) >> (sh < 64 ? 
sh : 63), 0, UINT32_MAX) 2171 2172 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) 2173 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) 2174 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) 2175 2176 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) 2177 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) 2178 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) 2179 2180 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) 2181 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) 2182 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) 2183 2184 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) 2185 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) 2186 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) 2187 2188 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) 2189 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) 2190 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) 2191 2192 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) 2193 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) 2194 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) 2195 2196 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) 2197 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) 2198 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) 2199 2200 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) 2201 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) 2202 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) 2203 2204 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) 2205 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) 2206 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) 2207 2208 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) 2209 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) 2210 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) 2211 2212 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) 2213 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) 2214 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) 2215 2216 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) 2217 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) 2218 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) 2219 2220 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) 2221 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) 2222 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) 2223 2224 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) 2225 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) 2226 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) 2227 2228 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) 2229 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) 2230 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) 2231 2232 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) 2233 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) 2234 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) 2235 2236 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) 2237 DO_SHRNT(sve2_uqrshrnt_s, 
uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) 2238 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) 2239 2240 #undef DO_SHRNB 2241 #undef DO_SHRNT 2242 2243 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ 2244 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2245 { \ 2246 intptr_t i, opr_sz = simd_oprsz(desc); \ 2247 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2248 TYPEW nn = *(TYPEW *)(vn + i); \ 2249 TYPEW mm = *(TYPEW *)(vm + i); \ 2250 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ 2251 } \ 2252 } 2253 2254 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ 2255 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2256 { \ 2257 intptr_t i, opr_sz = simd_oprsz(desc); \ 2258 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2259 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2260 TYPEW mm = *(TYPEW *)(vm + HW(i)); \ 2261 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ 2262 } \ 2263 } 2264 2265 #define DO_ADDHN(N, M, SH) ((N + M) >> SH) 2266 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) 2267 #define DO_SUBHN(N, M, SH) ((N - M) >> SH) 2268 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) 2269 2270 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) 2271 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) 2272 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) 2273 2274 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) 2275 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) 2276 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) 2277 2278 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) 2279 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) 2280 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) 2281 2282 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) 2283 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) 2284 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) 2285 2286 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) 2287 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) 2288 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) 2289 2290 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) 2291 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) 2292 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) 2293 2294 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) 2295 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) 2296 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) 2297 2298 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) 2299 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) 2300 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) 2301 2302 #undef DO_RSUBHN 2303 #undef DO_SUBHN 2304 #undef DO_RADDHN 2305 #undef DO_ADDHN 2306 2307 #undef DO_BINOPNB 2308 2309 /* Fully general four-operand expander, controlled by a predicate. 
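 * As in the predicated three-operand expanders earlier in this file,
 * the governing predicate is consumed 16 bits at a time (one bit per
 * byte of a 16-byte segment) and PG is shifted right by sizeof(TYPE)
 * each step, so only the low bit of each element's predicate field is
 * examined.  Illustrative: for 4-byte elements only predicate bits
 * 0, 4, 8 and 12 of each segment are tested.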
2310 */ 2311 #define DO_ZPZZZ(NAME, TYPE, H, OP) \ 2312 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2313 void *vg, uint32_t desc) \ 2314 { \ 2315 intptr_t i, opr_sz = simd_oprsz(desc); \ 2316 for (i = 0; i < opr_sz; ) { \ 2317 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2318 do { \ 2319 if (pg & 1) { \ 2320 TYPE nn = *(TYPE *)(vn + H(i)); \ 2321 TYPE mm = *(TYPE *)(vm + H(i)); \ 2322 TYPE aa = *(TYPE *)(va + H(i)); \ 2323 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ 2324 } \ 2325 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2326 } while (i & 15); \ 2327 } \ 2328 } 2329 2330 /* Similarly, specialized for 64-bit operands. */ 2331 #define DO_ZPZZZ_D(NAME, TYPE, OP) \ 2332 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2333 void *vg, uint32_t desc) \ 2334 { \ 2335 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2336 TYPE *d = vd, *a = va, *n = vn, *m = vm; \ 2337 uint8_t *pg = vg; \ 2338 for (i = 0; i < opr_sz; i += 1) { \ 2339 if (pg[H1(i)] & 1) { \ 2340 TYPE aa = a[i], nn = n[i], mm = m[i]; \ 2341 d[i] = OP(aa, nn, mm); \ 2342 } \ 2343 } \ 2344 } 2345 2346 #define DO_MLA(A, N, M) (A + N * M) 2347 #define DO_MLS(A, N, M) (A - N * M) 2348 2349 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) 2350 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) 2351 2352 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) 2353 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) 2354 2355 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) 2356 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) 2357 2358 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) 2359 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) 2360 2361 #undef DO_MLA 2362 #undef DO_MLS 2363 #undef DO_ZPZZZ 2364 #undef DO_ZPZZZ_D 2365 2366 void HELPER(sve_index_b)(void *vd, uint32_t start, 2367 uint32_t incr, uint32_t desc) 2368 { 2369 intptr_t i, opr_sz = simd_oprsz(desc); 2370 uint8_t *d = vd; 2371 for (i = 0; i < opr_sz; i += 1) { 2372 d[H1(i)] = start + i * incr; 2373 } 2374 } 2375 2376 void HELPER(sve_index_h)(void *vd, uint32_t start, 2377 uint32_t incr, uint32_t desc) 2378 { 2379 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2380 uint16_t *d = vd; 2381 for (i = 0; i < opr_sz; i += 1) { 2382 d[H2(i)] = start + i * incr; 2383 } 2384 } 2385 2386 void HELPER(sve_index_s)(void *vd, uint32_t start, 2387 uint32_t incr, uint32_t desc) 2388 { 2389 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2390 uint32_t *d = vd; 2391 for (i = 0; i < opr_sz; i += 1) { 2392 d[H4(i)] = start + i * incr; 2393 } 2394 } 2395 2396 void HELPER(sve_index_d)(void *vd, uint64_t start, 2397 uint64_t incr, uint32_t desc) 2398 { 2399 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2400 uint64_t *d = vd; 2401 for (i = 0; i < opr_sz; i += 1) { 2402 d[i] = start + i * incr; 2403 } 2404 } 2405 2406 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) 2407 { 2408 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2409 uint32_t sh = simd_data(desc); 2410 uint32_t *d = vd, *n = vn, *m = vm; 2411 for (i = 0; i < opr_sz; i += 1) { 2412 d[i] = n[i] + (m[i] << sh); 2413 } 2414 } 2415 2416 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) 2417 { 2418 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2419 uint64_t sh = simd_data(desc); 2420 uint64_t *d = vd, *n = vn, *m = vm; 2421 for (i = 0; i < opr_sz; i += 1) { 2422 d[i] = n[i] + (m[i] << sh); 2423 } 2424 } 2425 2426 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) 2427 { 2428 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2429 uint64_t sh = simd_data(desc); 2430 uint64_t *d = vd, *n = vn, *m = vm; 2431 for (i = 0; i < opr_sz; i += 1) { 2432 d[i] = 
n[i] + ((uint64_t)(int32_t)m[i] << sh); 2433 } 2434 } 2435 2436 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) 2437 { 2438 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2439 uint64_t sh = simd_data(desc); 2440 uint64_t *d = vd, *n = vn, *m = vm; 2441 for (i = 0; i < opr_sz; i += 1) { 2442 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); 2443 } 2444 } 2445 2446 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) 2447 { 2448 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2449 static const uint16_t coeff[] = { 2450 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, 2451 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, 2452 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, 2453 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, 2454 }; 2455 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2456 uint16_t *d = vd, *n = vn; 2457 2458 for (i = 0; i < opr_sz; i++) { 2459 uint16_t nn = n[i]; 2460 intptr_t idx = extract32(nn, 0, 5); 2461 uint16_t exp = extract32(nn, 5, 5); 2462 d[i] = coeff[idx] | (exp << 10); 2463 } 2464 } 2465 2466 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) 2467 { 2468 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2469 static const uint32_t coeff[] = { 2470 0x000000, 0x0164d2, 0x02cd87, 0x043a29, 2471 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, 2472 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, 2473 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, 2474 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, 2475 0x1ef532, 0x20b051, 0x227043, 0x243516, 2476 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, 2477 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, 2478 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, 2479 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, 2480 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, 2481 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, 2482 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, 2483 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, 2484 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, 2485 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, 2486 }; 2487 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2488 uint32_t *d = vd, *n = vn; 2489 2490 for (i = 0; i < opr_sz; i++) { 2491 uint32_t nn = n[i]; 2492 intptr_t idx = extract32(nn, 0, 6); 2493 uint32_t exp = extract32(nn, 6, 8); 2494 d[i] = coeff[idx] | (exp << 23); 2495 } 2496 } 2497 2498 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) 2499 { 2500 /* These constants are cut-and-paste directly from the ARM pseudocode. 
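 * Each entry is the 52-bit fraction field of 2**(i/64); FEXPA then
 * assembles an approximation to 2**x by pasting an exponent taken from
 * the remaining input bits above it.  Illustrative check (not part of
 * the build): coeff[32] == 0x6A09E667F3BCD, the fraction of
 * 2**(32/64) == sqrt(2) == 0x1.6a09e667f3bcdp+0.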
*/ 2501 static const uint64_t coeff[] = { 2502 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, 2503 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, 2504 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, 2505 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, 2506 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, 2507 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, 2508 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, 2509 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, 2510 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, 2511 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, 2512 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, 2513 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, 2514 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, 2515 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, 2516 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, 2517 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, 2518 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, 2519 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, 2520 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, 2521 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, 2522 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, 2523 0xFA7C1819E90D8ull, 2524 }; 2525 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2526 uint64_t *d = vd, *n = vn; 2527 2528 for (i = 0; i < opr_sz; i++) { 2529 uint64_t nn = n[i]; 2530 intptr_t idx = extract32(nn, 0, 6); 2531 uint64_t exp = extract32(nn, 6, 11); 2532 d[i] = coeff[idx] | (exp << 52); 2533 } 2534 } 2535 2536 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) 2537 { 2538 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2539 uint16_t *d = vd, *n = vn, *m = vm; 2540 for (i = 0; i < opr_sz; i += 1) { 2541 uint16_t nn = n[i]; 2542 uint16_t mm = m[i]; 2543 if (mm & 1) { 2544 nn = float16_one; 2545 } 2546 d[i] = nn ^ (mm & 2) << 14; 2547 } 2548 } 2549 2550 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) 2551 { 2552 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2553 uint32_t *d = vd, *n = vn, *m = vm; 2554 for (i = 0; i < opr_sz; i += 1) { 2555 uint32_t nn = n[i]; 2556 uint32_t mm = m[i]; 2557 if (mm & 1) { 2558 nn = float32_one; 2559 } 2560 d[i] = nn ^ (mm & 2) << 30; 2561 } 2562 } 2563 2564 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) 2565 { 2566 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2567 uint64_t *d = vd, *n = vn, *m = vm; 2568 for (i = 0; i < opr_sz; i += 1) { 2569 uint64_t nn = n[i]; 2570 uint64_t mm = m[i]; 2571 if (mm & 1) { 2572 nn = float64_one; 2573 } 2574 d[i] = nn ^ (mm & 2) << 62; 2575 } 2576 } 2577 2578 /* 2579 * Signed saturating addition with scalar operand. 
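 * The scalar is folded into every element with the same saturating
 * primitives used elsewhere in this file; e.g. (illustrative) for the
 * byte form, b == 0x60 added to an element holding 0x50 yields
 * INT8_MAX (0x7f) rather than wrapping to 0xb0.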
2580 */ 2581 2582 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2583 { 2584 intptr_t i, oprsz = simd_oprsz(desc); 2585 2586 for (i = 0; i < oprsz; i += sizeof(int8_t)) { 2587 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); 2588 } 2589 } 2590 2591 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2592 { 2593 intptr_t i, oprsz = simd_oprsz(desc); 2594 2595 for (i = 0; i < oprsz; i += sizeof(int16_t)) { 2596 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); 2597 } 2598 } 2599 2600 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2601 { 2602 intptr_t i, oprsz = simd_oprsz(desc); 2603 2604 for (i = 0; i < oprsz; i += sizeof(int32_t)) { 2605 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); 2606 } 2607 } 2608 2609 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) 2610 { 2611 intptr_t i, oprsz = simd_oprsz(desc); 2612 2613 for (i = 0; i < oprsz; i += sizeof(int64_t)) { 2614 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); 2615 } 2616 } 2617 2618 /* 2619 * Unsigned saturating addition with scalar operand. 2620 */ 2621 2622 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2623 { 2624 intptr_t i, oprsz = simd_oprsz(desc); 2625 2626 for (i = 0; i < oprsz; i += sizeof(uint8_t)) { 2627 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); 2628 } 2629 } 2630 2631 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2632 { 2633 intptr_t i, oprsz = simd_oprsz(desc); 2634 2635 for (i = 0; i < oprsz; i += sizeof(uint16_t)) { 2636 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); 2637 } 2638 } 2639 2640 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2641 { 2642 intptr_t i, oprsz = simd_oprsz(desc); 2643 2644 for (i = 0; i < oprsz; i += sizeof(uint32_t)) { 2645 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); 2646 } 2647 } 2648 2649 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2650 { 2651 intptr_t i, oprsz = simd_oprsz(desc); 2652 2653 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2654 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); 2655 } 2656 } 2657 2658 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2659 { 2660 intptr_t i, oprsz = simd_oprsz(desc); 2661 2662 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2663 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); 2664 } 2665 } 2666 2667 /* Two operand predicated copy immediate with merge. All valid immediates 2668 * can fit within 17 signed bits in the simd_data field. 
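 * Illustrative: for the byte form, a predicate byte of 0x0f expands via
 * expand_pred_b() to the mask 0x00000000ffffffff, so the low four byte
 * lanes of each 64-bit chunk take the immediate and the high four keep
 * their previous value.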
2669 */ 2670 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, 2671 uint64_t mm, uint32_t desc) 2672 { 2673 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2674 uint64_t *d = vd, *n = vn; 2675 uint8_t *pg = vg; 2676 2677 mm = dup_const(MO_8, mm); 2678 for (i = 0; i < opr_sz; i += 1) { 2679 uint64_t nn = n[i]; 2680 uint64_t pp = expand_pred_b(pg[H1(i)]); 2681 d[i] = (mm & pp) | (nn & ~pp); 2682 } 2683 } 2684 2685 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, 2686 uint64_t mm, uint32_t desc) 2687 { 2688 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2689 uint64_t *d = vd, *n = vn; 2690 uint8_t *pg = vg; 2691 2692 mm = dup_const(MO_16, mm); 2693 for (i = 0; i < opr_sz; i += 1) { 2694 uint64_t nn = n[i]; 2695 uint64_t pp = expand_pred_h(pg[H1(i)]); 2696 d[i] = (mm & pp) | (nn & ~pp); 2697 } 2698 } 2699 2700 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, 2701 uint64_t mm, uint32_t desc) 2702 { 2703 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2704 uint64_t *d = vd, *n = vn; 2705 uint8_t *pg = vg; 2706 2707 mm = dup_const(MO_32, mm); 2708 for (i = 0; i < opr_sz; i += 1) { 2709 uint64_t nn = n[i]; 2710 uint64_t pp = expand_pred_s(pg[H1(i)]); 2711 d[i] = (mm & pp) | (nn & ~pp); 2712 } 2713 } 2714 2715 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, 2716 uint64_t mm, uint32_t desc) 2717 { 2718 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2719 uint64_t *d = vd, *n = vn; 2720 uint8_t *pg = vg; 2721 2722 for (i = 0; i < opr_sz; i += 1) { 2723 uint64_t nn = n[i]; 2724 d[i] = (pg[H1(i)] & 1 ? mm : nn); 2725 } 2726 } 2727 2728 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) 2729 { 2730 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2731 uint64_t *d = vd; 2732 uint8_t *pg = vg; 2733 2734 val = dup_const(MO_8, val); 2735 for (i = 0; i < opr_sz; i += 1) { 2736 d[i] = val & expand_pred_b(pg[H1(i)]); 2737 } 2738 } 2739 2740 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) 2741 { 2742 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2743 uint64_t *d = vd; 2744 uint8_t *pg = vg; 2745 2746 val = dup_const(MO_16, val); 2747 for (i = 0; i < opr_sz; i += 1) { 2748 d[i] = val & expand_pred_h(pg[H1(i)]); 2749 } 2750 } 2751 2752 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) 2753 { 2754 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2755 uint64_t *d = vd; 2756 uint8_t *pg = vg; 2757 2758 val = dup_const(MO_32, val); 2759 for (i = 0; i < opr_sz; i += 1) { 2760 d[i] = val & expand_pred_s(pg[H1(i)]); 2761 } 2762 } 2763 2764 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) 2765 { 2766 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2767 uint64_t *d = vd; 2768 uint8_t *pg = vg; 2769 2770 for (i = 0; i < opr_sz; i += 1) { 2771 d[i] = (pg[H1(i)] & 1 ? val : 0); 2772 } 2773 } 2774 2775 /* Big-endian hosts need to frob the byte indices. If the copy 2776 * happens to be 8-byte aligned, then no frobbing necessary. 
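 * Illustrative: with 4-byte accesses, H1_4() effectively flips bit 2 of
 * the byte offset on a big-endian host, so offsets 0 and 4 within each
 * aligned 8-byte group are exchanged; that is exactly the adjustment
 * needed to preserve the guest's little-endian element order.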
2777 */ 2778 static void swap_memmove(void *vd, void *vs, size_t n) 2779 { 2780 uintptr_t d = (uintptr_t)vd; 2781 uintptr_t s = (uintptr_t)vs; 2782 uintptr_t o = (d | s | n) & 7; 2783 size_t i; 2784 2785 #if !HOST_BIG_ENDIAN 2786 o = 0; 2787 #endif 2788 switch (o) { 2789 case 0: 2790 memmove(vd, vs, n); 2791 break; 2792 2793 case 4: 2794 if (d < s || d >= s + n) { 2795 for (i = 0; i < n; i += 4) { 2796 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2797 } 2798 } else { 2799 for (i = n; i > 0; ) { 2800 i -= 4; 2801 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2802 } 2803 } 2804 break; 2805 2806 case 2: 2807 case 6: 2808 if (d < s || d >= s + n) { 2809 for (i = 0; i < n; i += 2) { 2810 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2811 } 2812 } else { 2813 for (i = n; i > 0; ) { 2814 i -= 2; 2815 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2816 } 2817 } 2818 break; 2819 2820 default: 2821 if (d < s || d >= s + n) { 2822 for (i = 0; i < n; i++) { 2823 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2824 } 2825 } else { 2826 for (i = n; i > 0; ) { 2827 i -= 1; 2828 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2829 } 2830 } 2831 break; 2832 } 2833 } 2834 2835 /* Similarly for memset of 0. */ 2836 static void swap_memzero(void *vd, size_t n) 2837 { 2838 uintptr_t d = (uintptr_t)vd; 2839 uintptr_t o = (d | n) & 7; 2840 size_t i; 2841 2842 /* Usually, the first bit of a predicate is set, so N is 0. */ 2843 if (likely(n == 0)) { 2844 return; 2845 } 2846 2847 #if !HOST_BIG_ENDIAN 2848 o = 0; 2849 #endif 2850 switch (o) { 2851 case 0: 2852 memset(vd, 0, n); 2853 break; 2854 2855 case 4: 2856 for (i = 0; i < n; i += 4) { 2857 *(uint32_t *)H1_4(d + i) = 0; 2858 } 2859 break; 2860 2861 case 2: 2862 case 6: 2863 for (i = 0; i < n; i += 2) { 2864 *(uint16_t *)H1_2(d + i) = 0; 2865 } 2866 break; 2867 2868 default: 2869 for (i = 0; i < n; i++) { 2870 *(uint8_t *)H1(d + i) = 0; 2871 } 2872 break; 2873 } 2874 } 2875 2876 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) 2877 { 2878 intptr_t opr_sz = simd_oprsz(desc); 2879 size_t n_ofs = simd_data(desc); 2880 size_t n_siz = opr_sz - n_ofs; 2881 2882 if (vd != vm) { 2883 swap_memmove(vd, vn + n_ofs, n_siz); 2884 swap_memmove(vd + n_siz, vm, n_ofs); 2885 } else if (vd != vn) { 2886 swap_memmove(vd + n_siz, vd, n_ofs); 2887 swap_memmove(vd, vn + n_ofs, n_siz); 2888 } else { 2889 /* vd == vn == vm. Need temp space. 
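 * The low n_ofs bytes of vm would be clobbered by the in-place move
 * below, so save them to tmp first and copy them into the top of vd
 * afterwards.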
*/ 2890 ARMVectorReg tmp; 2891 swap_memmove(&tmp, vm, n_ofs); 2892 swap_memmove(vd, vd + n_ofs, n_siz); 2893 memcpy(vd + n_siz, &tmp, n_ofs); 2894 } 2895 } 2896 2897 #define DO_INSR(NAME, TYPE, H) \ 2898 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ 2899 { \ 2900 intptr_t opr_sz = simd_oprsz(desc); \ 2901 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ 2902 *(TYPE *)(vd + H(0)) = val; \ 2903 } 2904 2905 DO_INSR(sve_insr_b, uint8_t, H1) 2906 DO_INSR(sve_insr_h, uint16_t, H1_2) 2907 DO_INSR(sve_insr_s, uint32_t, H1_4) 2908 DO_INSR(sve_insr_d, uint64_t, H1_8) 2909 2910 #undef DO_INSR 2911 2912 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) 2913 { 2914 intptr_t i, j, opr_sz = simd_oprsz(desc); 2915 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2916 uint64_t f = *(uint64_t *)(vn + i); 2917 uint64_t b = *(uint64_t *)(vn + j); 2918 *(uint64_t *)(vd + i) = bswap64(b); 2919 *(uint64_t *)(vd + j) = bswap64(f); 2920 } 2921 } 2922 2923 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) 2924 { 2925 intptr_t i, j, opr_sz = simd_oprsz(desc); 2926 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2927 uint64_t f = *(uint64_t *)(vn + i); 2928 uint64_t b = *(uint64_t *)(vn + j); 2929 *(uint64_t *)(vd + i) = hswap64(b); 2930 *(uint64_t *)(vd + j) = hswap64(f); 2931 } 2932 } 2933 2934 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) 2935 { 2936 intptr_t i, j, opr_sz = simd_oprsz(desc); 2937 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2938 uint64_t f = *(uint64_t *)(vn + i); 2939 uint64_t b = *(uint64_t *)(vn + j); 2940 *(uint64_t *)(vd + i) = rol64(b, 32); 2941 *(uint64_t *)(vd + j) = rol64(f, 32); 2942 } 2943 } 2944 2945 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) 2946 { 2947 intptr_t i, j, opr_sz = simd_oprsz(desc); 2948 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2949 uint64_t f = *(uint64_t *)(vn + i); 2950 uint64_t b = *(uint64_t *)(vn + j); 2951 *(uint64_t *)(vd + i) = b; 2952 *(uint64_t *)(vd + j) = f; 2953 } 2954 } 2955 2956 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); 2957 2958 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, 2959 bool is_tbx, tb_impl_fn *fn) 2960 { 2961 ARMVectorReg scratch; 2962 uintptr_t oprsz = simd_oprsz(desc); 2963 2964 if (unlikely(vd == vn)) { 2965 vn = memcpy(&scratch, vn, oprsz); 2966 } 2967 2968 fn(vd, vn, NULL, vm, oprsz, is_tbx); 2969 } 2970 2971 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, 2972 uint32_t desc, bool is_tbx, tb_impl_fn *fn) 2973 { 2974 ARMVectorReg scratch; 2975 uintptr_t oprsz = simd_oprsz(desc); 2976 2977 if (unlikely(vd == vn0)) { 2978 vn0 = memcpy(&scratch, vn0, oprsz); 2979 if (vd == vn1) { 2980 vn1 = vn0; 2981 } 2982 } else if (unlikely(vd == vn1)) { 2983 vn1 = memcpy(&scratch, vn1, oprsz); 2984 } 2985 2986 fn(vd, vn0, vn1, vm, oprsz, is_tbx); 2987 } 2988 2989 #define DO_TB(SUFF, TYPE, H) \ 2990 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ 2991 void *vm, uintptr_t oprsz, bool is_tbx) \ 2992 { \ 2993 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ 2994 uintptr_t i, nelem = oprsz / sizeof(TYPE); \ 2995 for (i = 0; i < nelem; ++i) { \ 2996 TYPE index = indexes[H1(i)], val = 0; \ 2997 if (index < nelem) { \ 2998 val = tbl0[H(index)]; \ 2999 } else { \ 3000 index -= nelem; \ 3001 if (tbl1 && index < nelem) { \ 3002 val = tbl1[H(index)]; \ 3003 } else if (is_tbx) { \ 3004 continue; \ 3005 } \ 3006 } \ 3007 
d[H(i)] = val; \ 3008 } \ 3009 } \ 3010 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3011 { \ 3012 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ 3013 } \ 3014 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ 3015 void *vm, uint32_t desc) \ 3016 { \ 3017 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ 3018 } \ 3019 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3020 { \ 3021 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ 3022 } 3023 3024 DO_TB(b, uint8_t, H1) 3025 DO_TB(h, uint16_t, H2) 3026 DO_TB(s, uint32_t, H4) 3027 DO_TB(d, uint64_t, H8) 3028 3029 #undef DO_TB 3030 3031 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ 3032 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 3033 { \ 3034 intptr_t i, opr_sz = simd_oprsz(desc); \ 3035 TYPED *d = vd; \ 3036 TYPES *n = vn; \ 3037 ARMVectorReg tmp; \ 3038 if (unlikely(vn - vd < opr_sz)) { \ 3039 n = memcpy(&tmp, n, opr_sz / 2); \ 3040 } \ 3041 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ 3042 d[HD(i)] = n[HS(i)]; \ 3043 } \ 3044 } 3045 3046 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) 3047 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) 3048 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) 3049 3050 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) 3051 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) 3052 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) 3053 3054 #undef DO_UNPK 3055 3056 /* Mask of bits included in the even numbered predicates of width esz. 3057 * We also use this for expand_bits/compress_bits, and so extend the 3058 * same pattern out to 16-bit units. 3059 */ 3060 static const uint64_t even_bit_esz_masks[5] = { 3061 0x5555555555555555ull, 3062 0x3333333333333333ull, 3063 0x0f0f0f0f0f0f0f0full, 3064 0x00ff00ff00ff00ffull, 3065 0x0000ffff0000ffffull, 3066 }; 3067 3068 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits. 3069 * For N==0, this corresponds to the operation that in qemu/bitops.h 3070 * we call half_shuffle64; this algorithm is from Hacker's Delight, 3071 * section 7-2 Shuffling Bits. 3072 */ 3073 static uint64_t expand_bits(uint64_t x, int n) 3074 { 3075 int i; 3076 3077 x &= 0xffffffffu; 3078 for (i = 4; i >= n; i--) { 3079 int sh = 1 << i; 3080 x = ((x << sh) | x) & even_bit_esz_masks[i]; 3081 } 3082 return x; 3083 } 3084 3085 /* Compress units of 2**(N+1) bits to units of 2**N bits. 3086 * For N==0, this corresponds to the operation that in qemu/bitops.h 3087 * we call half_unshuffle64; this algorithm is from Hacker's Delight, 3088 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. 
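 * Worked example (illustrative): for N == 0, expand_bits(0x000a, 0)
 * == 0x0044 (input bits 1 and 3 land in output bits 2 and 6) and
 * compress_bits(0x0044, 0) == 0x000a; compress_bits() undoes
 * expand_bits() for any 32-bit input.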
3089 */ 3090 static uint64_t compress_bits(uint64_t x, int n) 3091 { 3092 int i; 3093 3094 for (i = n; i <= 4; i++) { 3095 int sh = 1 << i; 3096 x &= even_bit_esz_masks[i]; 3097 x = (x >> sh) | x; 3098 } 3099 return x & 0xffffffffu; 3100 } 3101 3102 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3103 { 3104 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3105 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3106 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3107 int esize = 1 << esz; 3108 uint64_t *d = vd; 3109 intptr_t i; 3110 3111 if (oprsz <= 8) { 3112 uint64_t nn = *(uint64_t *)vn; 3113 uint64_t mm = *(uint64_t *)vm; 3114 int half = 4 * oprsz; 3115 3116 nn = extract64(nn, high * half, half); 3117 mm = extract64(mm, high * half, half); 3118 nn = expand_bits(nn, esz); 3119 mm = expand_bits(mm, esz); 3120 d[0] = nn | (mm << esize); 3121 } else { 3122 ARMPredicateReg tmp; 3123 3124 /* We produce output faster than we consume input. 3125 Therefore we must be mindful of possible overlap. */ 3126 if (vd == vn) { 3127 vn = memcpy(&tmp, vn, oprsz); 3128 if (vd == vm) { 3129 vm = vn; 3130 } 3131 } else if (vd == vm) { 3132 vm = memcpy(&tmp, vm, oprsz); 3133 } 3134 if (high) { 3135 high = oprsz >> 1; 3136 } 3137 3138 if ((oprsz & 7) == 0) { 3139 uint32_t *n = vn, *m = vm; 3140 high >>= 2; 3141 3142 for (i = 0; i < oprsz / 8; i++) { 3143 uint64_t nn = n[H4(high + i)]; 3144 uint64_t mm = m[H4(high + i)]; 3145 3146 nn = expand_bits(nn, esz); 3147 mm = expand_bits(mm, esz); 3148 d[i] = nn | (mm << esize); 3149 } 3150 } else { 3151 uint8_t *n = vn, *m = vm; 3152 uint16_t *d16 = vd; 3153 3154 for (i = 0; i < oprsz / 2; i++) { 3155 uint16_t nn = n[H1(high + i)]; 3156 uint16_t mm = m[H1(high + i)]; 3157 3158 nn = expand_bits(nn, esz); 3159 mm = expand_bits(mm, esz); 3160 d16[H2(i)] = nn | (mm << esize); 3161 } 3162 } 3163 } 3164 } 3165 3166 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3167 { 3168 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3169 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3170 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; 3171 uint64_t *d = vd, *n = vn, *m = vm; 3172 uint64_t l, h; 3173 intptr_t i; 3174 3175 if (oprsz <= 8) { 3176 l = compress_bits(n[0] >> odd, esz); 3177 h = compress_bits(m[0] >> odd, esz); 3178 d[0] = l | (h << (4 * oprsz)); 3179 } else { 3180 ARMPredicateReg tmp_m; 3181 intptr_t oprsz_16 = oprsz / 16; 3182 3183 if ((vm - vd) < (uintptr_t)oprsz) { 3184 m = memcpy(&tmp_m, vm, oprsz); 3185 } 3186 3187 for (i = 0; i < oprsz_16; i++) { 3188 l = n[2 * i + 0]; 3189 h = n[2 * i + 1]; 3190 l = compress_bits(l >> odd, esz); 3191 h = compress_bits(h >> odd, esz); 3192 d[i] = l | (h << 32); 3193 } 3194 3195 /* 3196 * For VL which is not a multiple of 512, the results from M do not 3197 * align nicely with the uint64_t for D. Put the aligned results 3198 * from M into TMP_M and then copy it into place afterward. 
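 * Each loop iteration folds two 64-bit words of predicate into one
 * 64-bit result word, so when OPRSZ is not a multiple of 16 bytes the
 * results from M would start at an unaligned offset within D; hence
 * the detour through TMP_M.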
3199 */ 3200 if (oprsz & 15) { 3201 int final_shift = (oprsz & 15) * 2; 3202 3203 l = n[2 * i + 0]; 3204 h = n[2 * i + 1]; 3205 l = compress_bits(l >> odd, esz); 3206 h = compress_bits(h >> odd, esz); 3207 d[i] = l | (h << final_shift); 3208 3209 for (i = 0; i < oprsz_16; i++) { 3210 l = m[2 * i + 0]; 3211 h = m[2 * i + 1]; 3212 l = compress_bits(l >> odd, esz); 3213 h = compress_bits(h >> odd, esz); 3214 tmp_m.p[i] = l | (h << 32); 3215 } 3216 l = m[2 * i + 0]; 3217 h = m[2 * i + 1]; 3218 l = compress_bits(l >> odd, esz); 3219 h = compress_bits(h >> odd, esz); 3220 tmp_m.p[i] = l | (h << final_shift); 3221 3222 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); 3223 } else { 3224 for (i = 0; i < oprsz_16; i++) { 3225 l = m[2 * i + 0]; 3226 h = m[2 * i + 1]; 3227 l = compress_bits(l >> odd, esz); 3228 h = compress_bits(h >> odd, esz); 3229 d[oprsz_16 + i] = l | (h << 32); 3230 } 3231 } 3232 } 3233 } 3234 3235 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3236 { 3237 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3238 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3239 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); 3240 uint64_t *d = vd, *n = vn, *m = vm; 3241 uint64_t mask; 3242 int shr, shl; 3243 intptr_t i; 3244 3245 shl = 1 << esz; 3246 shr = 0; 3247 mask = even_bit_esz_masks[esz]; 3248 if (odd) { 3249 mask <<= shl; 3250 shr = shl; 3251 shl = 0; 3252 } 3253 3254 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { 3255 uint64_t nn = (n[i] & mask) >> shr; 3256 uint64_t mm = (m[i] & mask) << shl; 3257 d[i] = nn + mm; 3258 } 3259 } 3260 3261 /* Reverse units of 2**N bits. */ 3262 static uint64_t reverse_bits_64(uint64_t x, int n) 3263 { 3264 int i, sh; 3265 3266 x = bswap64(x); 3267 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3268 uint64_t mask = even_bit_esz_masks[i]; 3269 x = ((x & mask) << sh) | ((x >> sh) & mask); 3270 } 3271 return x; 3272 } 3273 3274 static uint8_t reverse_bits_8(uint8_t x, int n) 3275 { 3276 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; 3277 int i, sh; 3278 3279 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3280 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); 3281 } 3282 return x; 3283 } 3284 3285 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) 3286 { 3287 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3288 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3289 intptr_t i, oprsz_2 = oprsz / 2; 3290 3291 if (oprsz <= 8) { 3292 uint64_t l = *(uint64_t *)vn; 3293 l = reverse_bits_64(l << (64 - 8 * oprsz), esz); 3294 *(uint64_t *)vd = l; 3295 } else if ((oprsz & 15) == 0) { 3296 for (i = 0; i < oprsz_2; i += 8) { 3297 intptr_t ih = oprsz - 8 - i; 3298 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); 3299 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); 3300 *(uint64_t *)(vd + i) = h; 3301 *(uint64_t *)(vd + ih) = l; 3302 } 3303 } else { 3304 for (i = 0; i < oprsz_2; i += 1) { 3305 intptr_t il = H1(i); 3306 intptr_t ih = H1(oprsz - 1 - i); 3307 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); 3308 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); 3309 *(uint8_t *)(vd + il) = h; 3310 *(uint8_t *)(vd + ih) = l; 3311 } 3312 } 3313 } 3314 3315 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) 3316 { 3317 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3318 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3319 uint64_t *d = vd; 3320 intptr_t i; 3321 3322 if (oprsz <= 8) { 3323 uint64_t nn = *(uint64_t *)vn; 3324 int half = 4 * oprsz; 3325 3326 nn = 
extract64(nn, high * half, half); 3327 nn = expand_bits(nn, 0); 3328 d[0] = nn; 3329 } else { 3330 ARMPredicateReg tmp_n; 3331 3332 /* We produce output faster than we consume input. 3333 Therefore we must be mindful of possible overlap. */ 3334 if ((vn - vd) < (uintptr_t)oprsz) { 3335 vn = memcpy(&tmp_n, vn, oprsz); 3336 } 3337 if (high) { 3338 high = oprsz >> 1; 3339 } 3340 3341 if ((oprsz & 7) == 0) { 3342 uint32_t *n = vn; 3343 high >>= 2; 3344 3345 for (i = 0; i < oprsz / 8; i++) { 3346 uint64_t nn = n[H4(high + i)]; 3347 d[i] = expand_bits(nn, 0); 3348 } 3349 } else { 3350 uint16_t *d16 = vd; 3351 uint8_t *n = vn; 3352 3353 for (i = 0; i < oprsz / 2; i++) { 3354 uint16_t nn = n[H1(high + i)]; 3355 d16[H2(i)] = expand_bits(nn, 0); 3356 } 3357 } 3358 } 3359 } 3360 3361 #define DO_ZIP(NAME, TYPE, H) \ 3362 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3363 { \ 3364 intptr_t oprsz = simd_oprsz(desc); \ 3365 intptr_t odd_ofs = simd_data(desc); \ 3366 intptr_t i, oprsz_2 = oprsz / 2; \ 3367 ARMVectorReg tmp_n, tmp_m; \ 3368 /* We produce output faster than we consume input. \ 3369 Therefore we must be mindful of possible overlap. */ \ 3370 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ 3371 vn = memcpy(&tmp_n, vn, oprsz); \ 3372 } \ 3373 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3374 vm = memcpy(&tmp_m, vm, oprsz); \ 3375 } \ 3376 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ 3377 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ 3378 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ 3379 *(TYPE *)(vm + odd_ofs + H(i)); \ 3380 } \ 3381 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3382 memset(vd + oprsz - 16, 0, 16); \ 3383 } \ 3384 } 3385 3386 DO_ZIP(sve_zip_b, uint8_t, H1) 3387 DO_ZIP(sve_zip_h, uint16_t, H1_2) 3388 DO_ZIP(sve_zip_s, uint32_t, H1_4) 3389 DO_ZIP(sve_zip_d, uint64_t, H1_8) 3390 DO_ZIP(sve2_zip_q, Int128, ) 3391 3392 #define DO_UZP(NAME, TYPE, H) \ 3393 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3394 { \ 3395 intptr_t oprsz = simd_oprsz(desc); \ 3396 intptr_t odd_ofs = simd_data(desc); \ 3397 intptr_t i, p; \ 3398 ARMVectorReg tmp_m; \ 3399 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3400 vm = memcpy(&tmp_m, vm, oprsz); \ 3401 } \ 3402 i = 0, p = odd_ofs; \ 3403 do { \ 3404 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ 3405 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3406 } while (p < oprsz); \ 3407 p -= oprsz; \ 3408 do { \ 3409 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ 3410 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3411 } while (p < oprsz); \ 3412 tcg_debug_assert(i == oprsz); \ 3413 } 3414 3415 DO_UZP(sve_uzp_b, uint8_t, H1) 3416 DO_UZP(sve_uzp_h, uint16_t, H1_2) 3417 DO_UZP(sve_uzp_s, uint32_t, H1_4) 3418 DO_UZP(sve_uzp_d, uint64_t, H1_8) 3419 DO_UZP(sve2_uzp_q, Int128, ) 3420 3421 #define DO_TRN(NAME, TYPE, H) \ 3422 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3423 { \ 3424 intptr_t oprsz = simd_oprsz(desc); \ 3425 intptr_t odd_ofs = simd_data(desc); \ 3426 intptr_t i; \ 3427 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ 3428 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ 3429 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ 3430 *(TYPE *)(vd + H(i + 0)) = ae; \ 3431 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ 3432 } \ 3433 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3434 memset(vd + oprsz - 16, 0, 16); \ 3435 } \ 3436 } 3437 3438 DO_TRN(sve_trn_b, uint8_t, H1) 3439 DO_TRN(sve_trn_h, uint16_t, H1_2) 3440 DO_TRN(sve_trn_s, uint32_t, H1_4) 3441 DO_TRN(sve_trn_d, 
uint64_t, H1_8) 3442 DO_TRN(sve2_trn_q, Int128, ) 3443 3444 #undef DO_ZIP 3445 #undef DO_UZP 3446 #undef DO_TRN 3447 3448 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) 3449 { 3450 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; 3451 uint32_t *d = vd, *n = vn; 3452 uint8_t *pg = vg; 3453 3454 for (i = j = 0; i < opr_sz; i++) { 3455 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { 3456 d[H4(j)] = n[H4(i)]; 3457 j++; 3458 } 3459 } 3460 for (; j < opr_sz; j++) { 3461 d[H4(j)] = 0; 3462 } 3463 } 3464 3465 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) 3466 { 3467 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; 3468 uint64_t *d = vd, *n = vn; 3469 uint8_t *pg = vg; 3470 3471 for (i = j = 0; i < opr_sz; i++) { 3472 if (pg[H1(i)] & 1) { 3473 d[j] = n[i]; 3474 j++; 3475 } 3476 } 3477 for (; j < opr_sz; j++) { 3478 d[j] = 0; 3479 } 3480 } 3481 3482 /* Similar to the ARM LastActiveElement pseudocode function, except the 3483 * result is multiplied by the element size. This includes the not found 3484 * indication; e.g. not found for esz=3 is -8. 3485 */ 3486 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) 3487 { 3488 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 3489 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3490 3491 return last_active_element(vg, words, esz); 3492 } 3493 3494 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) 3495 { 3496 intptr_t opr_sz = simd_oprsz(desc) / 8; 3497 int esz = simd_data(desc); 3498 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; 3499 intptr_t i, first_i, last_i; 3500 ARMVectorReg tmp; 3501 3502 first_i = last_i = 0; 3503 first_g = last_g = 0; 3504 3505 /* Find the extent of the active elements within VG. 
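     * Each predicate bit corresponds to one byte of the vector, so the bit
     * offsets derived from FIRST_G/LAST_G below double as byte offsets into
     * VN for the copy. The scan runs from the highest predicate word down:
     * LAST_G/LAST_I latch the highest non-zero word, FIRST_G/FIRST_I end up
     * holding the lowest, and ctz64/clz64 refine them to exact elements.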
*/ 3506 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { 3507 pg = *(uint64_t *)(vg + i) & mask; 3508 if (pg) { 3509 if (last_g == 0) { 3510 last_g = pg; 3511 last_i = i; 3512 } 3513 first_g = pg; 3514 first_i = i; 3515 } 3516 } 3517 3518 len = 0; 3519 if (first_g != 0) { 3520 first_i = first_i * 8 + ctz64(first_g); 3521 last_i = last_i * 8 + 63 - clz64(last_g); 3522 len = last_i - first_i + (1 << esz); 3523 if (vd == vm) { 3524 vm = memcpy(&tmp, vm, opr_sz * 8); 3525 } 3526 swap_memmove(vd, vn + first_i, len); 3527 } 3528 swap_memmove(vd + len, vm, opr_sz * 8 - len); 3529 } 3530 3531 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, 3532 void *vg, uint32_t desc) 3533 { 3534 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3535 uint64_t *d = vd, *n = vn, *m = vm; 3536 uint8_t *pg = vg; 3537 3538 for (i = 0; i < opr_sz; i += 1) { 3539 uint64_t nn = n[i], mm = m[i]; 3540 uint64_t pp = expand_pred_b(pg[H1(i)]); 3541 d[i] = (nn & pp) | (mm & ~pp); 3542 } 3543 } 3544 3545 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, 3546 void *vg, uint32_t desc) 3547 { 3548 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3549 uint64_t *d = vd, *n = vn, *m = vm; 3550 uint8_t *pg = vg; 3551 3552 for (i = 0; i < opr_sz; i += 1) { 3553 uint64_t nn = n[i], mm = m[i]; 3554 uint64_t pp = expand_pred_h(pg[H1(i)]); 3555 d[i] = (nn & pp) | (mm & ~pp); 3556 } 3557 } 3558 3559 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, 3560 void *vg, uint32_t desc) 3561 { 3562 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3563 uint64_t *d = vd, *n = vn, *m = vm; 3564 uint8_t *pg = vg; 3565 3566 for (i = 0; i < opr_sz; i += 1) { 3567 uint64_t nn = n[i], mm = m[i]; 3568 uint64_t pp = expand_pred_s(pg[H1(i)]); 3569 d[i] = (nn & pp) | (mm & ~pp); 3570 } 3571 } 3572 3573 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, 3574 void *vg, uint32_t desc) 3575 { 3576 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3577 uint64_t *d = vd, *n = vn, *m = vm; 3578 uint8_t *pg = vg; 3579 3580 for (i = 0; i < opr_sz; i += 1) { 3581 uint64_t nn = n[i], mm = m[i]; 3582 d[i] = (pg[H1(i)] & 1 ? nn : mm); 3583 } 3584 } 3585 3586 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, 3587 void *vg, uint32_t desc) 3588 { 3589 intptr_t i, opr_sz = simd_oprsz(desc) / 16; 3590 Int128 *d = vd, *n = vn, *m = vm; 3591 uint16_t *pg = vg; 3592 3593 for (i = 0; i < opr_sz; i += 1) { 3594 d[i] = (pg[H2(i)] & 1 ? n : m)[i]; 3595 } 3596 } 3597 3598 /* Two operand comparison controlled by a predicate. 3599 * ??? It is very tempting to want to be able to expand this inline 3600 * with x86 instructions, e.g. 3601 * 3602 * vcmpeqw zm, zn, %ymm0 3603 * vpmovmskb %ymm0, %eax 3604 * and $0x5555, %eax 3605 * and pg, %eax 3606 * 3607 * or even aarch64, e.g. 3608 * 3609 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 3610 * cmeq v0.8h, zn, zm 3611 * and v0.8h, v0.8h, mask 3612 * addv h0, v0.8h 3613 * and v0.8b, pg 3614 * 3615 * However, coming up with an abstraction that allows vector inputs and 3616 * a scalar output, and also handles the byte-ordering of sub-uint64_t 3617 * scalar outputs, is tricky. 
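 *
 * The portable loop below packs results instead: iterating from the top of
 * each 64-byte block downward, the boolean result for the element at byte
 * offset I (within the block) ends up in predicate bit I of a 64-bit
 * accumulator, roughly
 *
 *    out = (out << sizeof(TYPE)) | (nn OP mm);
 *
 * which is then ANDed with the governing predicate, stored to PD, and fed
 * to iter_predtest_bwd to accumulate NZCV.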
3618 */ 3619 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ 3620 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3621 { \ 3622 intptr_t opr_sz = simd_oprsz(desc); \ 3623 uint32_t flags = PREDTEST_INIT; \ 3624 intptr_t i = opr_sz; \ 3625 do { \ 3626 uint64_t out = 0, pg; \ 3627 do { \ 3628 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3629 TYPE nn = *(TYPE *)(vn + H(i)); \ 3630 TYPE mm = *(TYPE *)(vm + H(i)); \ 3631 out |= nn OP mm; \ 3632 } while (i & 63); \ 3633 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3634 out &= pg; \ 3635 *(uint64_t *)(vd + (i >> 3)) = out; \ 3636 flags = iter_predtest_bwd(out, pg, flags); \ 3637 } while (i > 0); \ 3638 return flags; \ 3639 } 3640 3641 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ 3642 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3643 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ 3644 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3645 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ 3646 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3647 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ 3648 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3649 3650 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) 3651 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) 3652 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) 3653 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) 3654 3655 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) 3656 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) 3657 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) 3658 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) 3659 3660 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) 3661 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) 3662 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) 3663 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) 3664 3665 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) 3666 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) 3667 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) 3668 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) 3669 3670 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) 3671 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) 3672 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) 3673 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) 3674 3675 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) 3676 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) 3677 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) 3678 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) 3679 3680 #undef DO_CMP_PPZZ_B 3681 #undef DO_CMP_PPZZ_H 3682 #undef DO_CMP_PPZZ_S 3683 #undef DO_CMP_PPZZ_D 3684 #undef DO_CMP_PPZZ 3685 3686 /* Similar, but the second source is "wide". 
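 * Here "wide" means the second operand is a vector of 64-bit elements:
 * one TYPEW value of ZM is compared against every narrower element of ZN
 * that shares the same 8-byte lane, which is why the expander below
 * reloads MM only once per 8 bytes in the extra inner loop.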
*/ 3687 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ 3688 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3689 { \ 3690 intptr_t opr_sz = simd_oprsz(desc); \ 3691 uint32_t flags = PREDTEST_INIT; \ 3692 intptr_t i = opr_sz; \ 3693 do { \ 3694 uint64_t out = 0, pg; \ 3695 do { \ 3696 TYPEW mm = *(TYPEW *)(vm + i - 8); \ 3697 do { \ 3698 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3699 TYPE nn = *(TYPE *)(vn + H(i)); \ 3700 out |= nn OP mm; \ 3701 } while (i & 7); \ 3702 } while (i & 63); \ 3703 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3704 out &= pg; \ 3705 *(uint64_t *)(vd + (i >> 3)) = out; \ 3706 flags = iter_predtest_bwd(out, pg, flags); \ 3707 } while (i > 0); \ 3708 return flags; \ 3709 } 3710 3711 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ 3712 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) 3713 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ 3714 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) 3715 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ 3716 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) 3717 3718 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) 3719 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) 3720 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) 3721 3722 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) 3723 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) 3724 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) 3725 3726 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) 3727 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) 3728 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) 3729 3730 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) 3731 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) 3732 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) 3733 3734 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) 3735 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) 3736 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) 3737 3738 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) 3739 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) 3740 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) 3741 3742 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) 3743 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) 3744 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) 3745 3746 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) 3747 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) 3748 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) 3749 3750 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) 3751 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) 3752 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) 3753 3754 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) 3755 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) 3756 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) 3757 3758 #undef DO_CMP_PPZW_B 3759 #undef DO_CMP_PPZW_H 3760 #undef DO_CMP_PPZW_S 3761 #undef DO_CMP_PPZW 3762 3763 /* Similar, but the second source is immediate. 
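 * The immediate is carried in the simd_data field of DESC; the assignment
 * to TYPE below converts it to the element type, so the one expander
 * serves both the signed and unsigned comparison forms.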
*/ 3764 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ 3765 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 3766 { \ 3767 intptr_t opr_sz = simd_oprsz(desc); \ 3768 uint32_t flags = PREDTEST_INIT; \ 3769 TYPE mm = simd_data(desc); \ 3770 intptr_t i = opr_sz; \ 3771 do { \ 3772 uint64_t out = 0, pg; \ 3773 do { \ 3774 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3775 TYPE nn = *(TYPE *)(vn + H(i)); \ 3776 out |= nn OP mm; \ 3777 } while (i & 63); \ 3778 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3779 out &= pg; \ 3780 *(uint64_t *)(vd + (i >> 3)) = out; \ 3781 flags = iter_predtest_bwd(out, pg, flags); \ 3782 } while (i > 0); \ 3783 return flags; \ 3784 } 3785 3786 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \ 3787 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3788 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \ 3789 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3790 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \ 3791 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3792 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \ 3793 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3794 3795 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) 3796 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) 3797 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) 3798 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) 3799 3800 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) 3801 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) 3802 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) 3803 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) 3804 3805 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) 3806 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) 3807 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) 3808 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) 3809 3810 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) 3811 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) 3812 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) 3813 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) 3814 3815 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) 3816 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) 3817 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) 3818 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) 3819 3820 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) 3821 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) 3822 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) 3823 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) 3824 3825 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) 3826 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) 3827 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) 3828 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) 3829 3830 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) 3831 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) 3832 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) 3833 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) 3834 3835 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) 3836 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) 3837 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) 3838 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) 3839 3840 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) 3841 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) 3842 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) 3843 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) 3844 3845 #undef DO_CMP_PPZI_B 3846 #undef DO_CMP_PPZI_H 3847 #undef DO_CMP_PPZI_S 3848 #undef DO_CMP_PPZI_D 3849 #undef DO_CMP_PPZI 3850 3851 /* Similar to the ARM LastActive pseudocode function. 
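 * That is: return the value of the D bit at the position of the last
 * active (guard) element. The scan below starts from the highest
 * predicate word; pow2floor isolates the most significant set bit of the
 * first non-zero guard word, which is then tested against D.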
*/ 3852 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) 3853 { 3854 intptr_t i; 3855 3856 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { 3857 uint64_t pg = *(uint64_t *)(vg + i); 3858 if (pg) { 3859 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; 3860 } 3861 } 3862 return 0; 3863 } 3864 3865 /* Compute a mask into RETB that is true for all G, up to and including 3866 * (if after) or excluding (if !after) the first G & N. 3867 * Return true if BRK found. 3868 */ 3869 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, 3870 bool brk, bool after) 3871 { 3872 uint64_t b; 3873 3874 if (brk) { 3875 b = 0; 3876 } else if ((g & n) == 0) { 3877 /* For all G, no N are set; break not found. */ 3878 b = g; 3879 } else { 3880 /* Break somewhere in N. Locate it. */ 3881 b = g & n; /* guard true, pred true */ 3882 b = b & -b; /* first such */ 3883 if (after) { 3884 b = b | (b - 1); /* break after same */ 3885 } else { 3886 b = b - 1; /* break before same */ 3887 } 3888 brk = true; 3889 } 3890 3891 *retb = b; 3892 return brk; 3893 } 3894 3895 /* Compute a zeroing BRK. */ 3896 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, 3897 intptr_t oprsz, bool after) 3898 { 3899 bool brk = false; 3900 intptr_t i; 3901 3902 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3903 uint64_t this_b, this_g = g[i]; 3904 3905 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3906 d[i] = this_b & this_g; 3907 } 3908 } 3909 3910 /* Likewise, but also compute flags. */ 3911 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, 3912 intptr_t oprsz, bool after) 3913 { 3914 uint32_t flags = PREDTEST_INIT; 3915 bool brk = false; 3916 intptr_t i; 3917 3918 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3919 uint64_t this_b, this_d, this_g = g[i]; 3920 3921 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3922 d[i] = this_d = this_b & this_g; 3923 flags = iter_predtest_fwd(this_d, this_g, flags); 3924 } 3925 return flags; 3926 } 3927 3928 /* Compute a merging BRK. */ 3929 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, 3930 intptr_t oprsz, bool after) 3931 { 3932 bool brk = false; 3933 intptr_t i; 3934 3935 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3936 uint64_t this_b, this_g = g[i]; 3937 3938 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3939 d[i] = (this_b & this_g) | (d[i] & ~this_g); 3940 } 3941 } 3942 3943 /* Likewise, but also compute flags. */ 3944 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, 3945 intptr_t oprsz, bool after) 3946 { 3947 uint32_t flags = PREDTEST_INIT; 3948 bool brk = false; 3949 intptr_t i; 3950 3951 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3952 uint64_t this_b, this_d = d[i], this_g = g[i]; 3953 3954 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3955 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); 3956 flags = iter_predtest_fwd(this_d, this_g, flags); 3957 } 3958 return flags; 3959 } 3960 3961 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) 3962 { 3963 /* It is quicker to zero the whole predicate than loop on OPRSZ. 3964 * The compiler should turn this into 4 64-bit integer stores.
3965 */ 3966 memset(d, 0, sizeof(ARMPredicateReg)); 3967 return PREDTEST_INIT; 3968 } 3969 3970 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, 3971 uint32_t pred_desc) 3972 { 3973 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3974 if (last_active_pred(vn, vg, oprsz)) { 3975 compute_brk_z(vd, vm, vg, oprsz, true); 3976 } else { 3977 do_zero(vd, oprsz); 3978 } 3979 } 3980 3981 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, 3982 uint32_t pred_desc) 3983 { 3984 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3985 if (last_active_pred(vn, vg, oprsz)) { 3986 return compute_brks_z(vd, vm, vg, oprsz, true); 3987 } else { 3988 return do_zero(vd, oprsz); 3989 } 3990 } 3991 3992 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, 3993 uint32_t pred_desc) 3994 { 3995 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3996 if (last_active_pred(vn, vg, oprsz)) { 3997 compute_brk_z(vd, vm, vg, oprsz, false); 3998 } else { 3999 do_zero(vd, oprsz); 4000 } 4001 } 4002 4003 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, 4004 uint32_t pred_desc) 4005 { 4006 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4007 if (last_active_pred(vn, vg, oprsz)) { 4008 return compute_brks_z(vd, vm, vg, oprsz, false); 4009 } else { 4010 return do_zero(vd, oprsz); 4011 } 4012 } 4013 4014 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4015 { 4016 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4017 compute_brk_z(vd, vn, vg, oprsz, true); 4018 } 4019 4020 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4021 { 4022 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4023 return compute_brks_z(vd, vn, vg, oprsz, true); 4024 } 4025 4026 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4027 { 4028 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4029 compute_brk_z(vd, vn, vg, oprsz, false); 4030 } 4031 4032 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4033 { 4034 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4035 return compute_brks_z(vd, vn, vg, oprsz, false); 4036 } 4037 4038 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4039 { 4040 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4041 compute_brk_m(vd, vn, vg, oprsz, true); 4042 } 4043 4044 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4045 { 4046 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4047 return compute_brks_m(vd, vn, vg, oprsz, true); 4048 } 4049 4050 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4051 { 4052 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4053 compute_brk_m(vd, vn, vg, oprsz, false); 4054 } 4055 4056 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4057 { 4058 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4059 return compute_brks_m(vd, vn, vg, oprsz, false); 4060 } 4061 4062 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4063 { 4064 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4065 if (!last_active_pred(vn, vg, oprsz)) { 4066 do_zero(vd, oprsz); 4067 } 4068 } 4069 4070 /* As if PredTest(Ones(PL), D, esz). 
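 * That is, compute the NZCV flags PredTest would return if every element
 * of the governing predicate were active for this element size. A partial
 * final word is handled by trimming ESZ_MASK to the bytes that actually
 * fall within OPRSZ.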
*/ 4071 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, 4072 uint64_t esz_mask) 4073 { 4074 uint32_t flags = PREDTEST_INIT; 4075 intptr_t i; 4076 4077 for (i = 0; i < oprsz / 8; i++) { 4078 flags = iter_predtest_fwd(d->p[i], esz_mask, flags); 4079 } 4080 if (oprsz & 7) { 4081 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); 4082 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); 4083 } 4084 return flags; 4085 } 4086 4087 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4088 { 4089 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4090 if (last_active_pred(vn, vg, oprsz)) { 4091 return predtest_ones(vd, oprsz, -1); 4092 } else { 4093 return do_zero(vd, oprsz); 4094 } 4095 } 4096 4097 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) 4098 { 4099 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 4100 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4101 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; 4102 intptr_t i; 4103 4104 for (i = 0; i < words; ++i) { 4105 uint64_t t = n[i] & g[i] & mask; 4106 sum += ctpop64(t); 4107 } 4108 return sum; 4109 } 4110 4111 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) 4112 { 4113 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4114 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4115 uint64_t esz_mask = pred_esz_masks[esz]; 4116 ARMPredicateReg *d = vd; 4117 uint32_t flags; 4118 intptr_t i; 4119 4120 /* Begin with a zero predicate register. */ 4121 flags = do_zero(d, oprsz); 4122 if (count == 0) { 4123 return flags; 4124 } 4125 4126 /* Set all of the requested bits. */ 4127 for (i = 0; i < count / 64; ++i) { 4128 d->p[i] = esz_mask; 4129 } 4130 if (count & 63) { 4131 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; 4132 } 4133 4134 return predtest_ones(d, oprsz, esz_mask); 4135 } 4136 4137 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) 4138 { 4139 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4140 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4141 uint64_t esz_mask = pred_esz_masks[esz]; 4142 ARMPredicateReg *d = vd; 4143 intptr_t i, invcount, oprbits; 4144 uint64_t bits; 4145 4146 if (count == 0) { 4147 return do_zero(d, oprsz); 4148 } 4149 4150 oprbits = oprsz * 8; 4151 tcg_debug_assert(count <= oprbits); 4152 4153 bits = esz_mask; 4154 if (oprbits & 63) { 4155 bits &= MAKE_64BIT_MASK(0, oprbits & 63); 4156 } 4157 4158 invcount = oprbits - count; 4159 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { 4160 d->p[i] = bits; 4161 bits = esz_mask; 4162 } 4163 4164 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); 4165 4166 while (--i >= 0) { 4167 d->p[i] = 0; 4168 } 4169 4170 return predtest_ones(d, oprsz, esz_mask); 4171 } 4172 4173 /* Recursive reduction on a function; 4174 * C.f. the ARM ARM function ReducePredicated. 4175 * 4176 * While it would be possible to write this without the DATA temporary, 4177 * it is much simpler to process the predicate register this way. 4178 * The recursion is bounded to depth 7 (128 fp16 elements), so there's 4179 * little to gain with a more complex non-recursive form. 
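 *
 * For example, with eight elements the recursion evaluates, schematically,
 *
 *    ((e0 . e1) . (e2 . e3)) . ((e4 . e5) . (e6 . e7))
 *
 * where "." stands for FUNC; inactive elements and the tail up to MAXSZ
 * have already been replaced by IDENT when the DATA temporary was filled.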
4180 */ 4181 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ 4182 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ 4183 { \ 4184 if (n == 1) { \ 4185 return *data; \ 4186 } else { \ 4187 uintptr_t half = n / 2; \ 4188 TYPE lo = NAME##_reduce(data, status, half); \ 4189 TYPE hi = NAME##_reduce(data + half, status, half); \ 4190 return TYPE##_##FUNC(lo, hi, status); \ 4191 } \ 4192 } \ 4193 uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \ 4194 { \ 4195 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ 4196 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ 4197 for (i = 0; i < oprsz; ) { \ 4198 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 4199 do { \ 4200 TYPE nn = *(TYPE *)(vn + H(i)); \ 4201 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \ 4202 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 4203 } while (i & 15); \ 4204 } \ 4205 for (; i < maxsz; i += sizeof(TYPE)) { \ 4206 *(TYPE *)((void *)data + i) = IDENT; \ 4207 } \ 4208 return NAME##_reduce(data, s, maxsz / sizeof(TYPE)); \ 4209 } 4210 4211 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero) 4212 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero) 4213 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero) 4214 4215 /* Identity is floatN_default_nan, without the function call. */ 4216 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00) 4217 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000) 4218 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL) 4219 4220 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00) 4221 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000) 4222 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL) 4223 4224 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity) 4225 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity) 4226 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity) 4227 4228 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity)) 4229 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity)) 4230 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity)) 4231 4232 #undef DO_REDUCE 4233 4234 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, 4235 float_status *status, uint32_t desc) 4236 { 4237 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4238 float16 result = nn; 4239 4240 do { 4241 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4242 do { 4243 if (pg & 1) { 4244 float16 mm = *(float16 *)(vm + H1_2(i)); 4245 result = float16_add(result, mm, status); 4246 } 4247 i += sizeof(float16), pg >>= sizeof(float16); 4248 } while (i & 15); 4249 } while (i < opr_sz); 4250 4251 return result; 4252 } 4253 4254 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, 4255 float_status *status, uint32_t desc) 4256 { 4257 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4258 float32 result = nn; 4259 4260 do { 4261 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4262 do { 4263 if (pg & 1) { 4264 float32 mm = *(float32 *)(vm + H1_2(i)); 4265 result = float32_add(result, mm, status); 4266 } 4267 i += sizeof(float32), pg >>= sizeof(float32); 4268 } while (i & 15); 4269 } while (i < opr_sz); 4270 4271 return result; 4272 } 4273 4274 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, 4275 float_status *status, uint32_t desc) 4276 { 4277 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; 4278 uint64_t *m = vm; 4279 uint8_t *pg = vg; 4280 4281 for (i = 0; i < opr_sz; i++) { 4282 if (pg[H1(i)] & 
1) { 4283 nn = float64_add(nn, m[i], status); 4284 } 4285 } 4286 4287 return nn; 4288 } 4289 4290 /* Fully general three-operand expander, controlled by a predicate, 4291 * With the extra float_status parameter. 4292 */ 4293 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ 4294 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4295 float_status *status, uint32_t desc) \ 4296 { \ 4297 intptr_t i = simd_oprsz(desc); \ 4298 uint64_t *g = vg; \ 4299 do { \ 4300 uint64_t pg = g[(i - 1) >> 6]; \ 4301 do { \ 4302 i -= sizeof(TYPE); \ 4303 if (likely((pg >> (i & 63)) & 1)) { \ 4304 TYPE nn = *(TYPE *)(vn + H(i)); \ 4305 TYPE mm = *(TYPE *)(vm + H(i)); \ 4306 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4307 } \ 4308 } while (i & 63); \ 4309 } while (i != 0); \ 4310 } 4311 4312 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) 4313 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) 4314 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) 4315 4316 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) 4317 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) 4318 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) 4319 4320 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) 4321 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) 4322 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) 4323 4324 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) 4325 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) 4326 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) 4327 4328 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) 4329 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) 4330 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) 4331 4332 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) 4333 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) 4334 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) 4335 4336 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) 4337 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) 4338 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) 4339 4340 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) 4341 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) 4342 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) 4343 4344 static inline float16 abd_h(float16 a, float16 b, float_status *s) 4345 { 4346 return float16_abs(float16_sub(a, b, s)); 4347 } 4348 4349 static inline float32 abd_s(float32 a, float32 b, float_status *s) 4350 { 4351 return float32_abs(float32_sub(a, b, s)); 4352 } 4353 4354 static inline float64 abd_d(float64 a, float64 b, float_status *s) 4355 { 4356 return float64_abs(float64_sub(a, b, s)); 4357 } 4358 4359 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) 4360 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) 4361 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) 4362 4363 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) 4364 { 4365 int b_int = MIN(MAX(b, INT_MIN), INT_MAX); 4366 return float64_scalbn(a, b_int, s); 4367 } 4368 4369 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) 4370 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) 4371 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) 4372 4373 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) 4374 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) 4375 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) 4376 4377 #undef DO_ZPZZ_FP 4378 4379 /* Three-operand expander, with one scalar operand, controlled by 4380 * a predicate, with the extra float_status parameter. 
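 * The scalar arrives in a uint64_t and is converted to TYPE once, outside
 * the loop; only the predicated elements of VN are combined with it.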
4381 */ 4382 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \ 4383 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ 4384 float_status *status, uint32_t desc) \ 4385 { \ 4386 intptr_t i = simd_oprsz(desc); \ 4387 uint64_t *g = vg; \ 4388 TYPE mm = scalar; \ 4389 do { \ 4390 uint64_t pg = g[(i - 1) >> 6]; \ 4391 do { \ 4392 i -= sizeof(TYPE); \ 4393 if (likely((pg >> (i & 63)) & 1)) { \ 4394 TYPE nn = *(TYPE *)(vn + H(i)); \ 4395 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4396 } \ 4397 } while (i & 63); \ 4398 } while (i != 0); \ 4399 } 4400 4401 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) 4402 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) 4403 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) 4404 4405 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) 4406 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) 4407 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) 4408 4409 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) 4410 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) 4411 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) 4412 4413 static inline float16 subr_h(float16 a, float16 b, float_status *s) 4414 { 4415 return float16_sub(b, a, s); 4416 } 4417 4418 static inline float32 subr_s(float32 a, float32 b, float_status *s) 4419 { 4420 return float32_sub(b, a, s); 4421 } 4422 4423 static inline float64 subr_d(float64 a, float64 b, float_status *s) 4424 { 4425 return float64_sub(b, a, s); 4426 } 4427 4428 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) 4429 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) 4430 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) 4431 4432 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) 4433 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) 4434 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) 4435 4436 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) 4437 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) 4438 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) 4439 4440 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) 4441 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) 4442 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) 4443 4444 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) 4445 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) 4446 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) 4447 4448 /* Fully general two-operand expander, controlled by a predicate, 4449 * With the extra float_status parameter. 4450 */ 4451 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \ 4452 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 4453 float_status *status, uint32_t desc) \ 4454 { \ 4455 intptr_t i = simd_oprsz(desc); \ 4456 uint64_t *g = vg; \ 4457 do { \ 4458 uint64_t pg = g[(i - 1) >> 6]; \ 4459 do { \ 4460 i -= sizeof(TYPE); \ 4461 if (likely((pg >> (i & 63)) & 1)) { \ 4462 TYPE nn = *(TYPE *)(vn + H(i)); \ 4463 *(TYPE *)(vd + H(i)) = OP(nn, status); \ 4464 } \ 4465 } while (i & 63); \ 4466 } while (i != 0); \ 4467 } 4468 4469 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore 4470 * FZ16. When converting from fp16, this affects flushing input denormals; 4471 * when converting to fp16, this affects flushing output denormals. 
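 * The helpers below therefore save the relevant flush bit, clear it for
 * the duration of the conversion, and restore it afterwards.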
4472 */ 4473 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) 4474 { 4475 bool save = get_flush_inputs_to_zero(fpst); 4476 float32 ret; 4477 4478 set_flush_inputs_to_zero(false, fpst); 4479 ret = float16_to_float32(f, true, fpst); 4480 set_flush_inputs_to_zero(save, fpst); 4481 return ret; 4482 } 4483 4484 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) 4485 { 4486 bool save = get_flush_inputs_to_zero(fpst); 4487 float64 ret; 4488 4489 set_flush_inputs_to_zero(false, fpst); 4490 ret = float16_to_float64(f, true, fpst); 4491 set_flush_inputs_to_zero(save, fpst); 4492 return ret; 4493 } 4494 4495 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) 4496 { 4497 bool save = get_flush_to_zero(fpst); 4498 float16 ret; 4499 4500 set_flush_to_zero(false, fpst); 4501 ret = float32_to_float16(f, true, fpst); 4502 set_flush_to_zero(save, fpst); 4503 return ret; 4504 } 4505 4506 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) 4507 { 4508 bool save = get_flush_to_zero(fpst); 4509 float16 ret; 4510 4511 set_flush_to_zero(false, fpst); 4512 ret = float64_to_float16(f, true, fpst); 4513 set_flush_to_zero(save, fpst); 4514 return ret; 4515 } 4516 4517 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) 4518 { 4519 if (float16_is_any_nan(f)) { 4520 float_raise(float_flag_invalid, s); 4521 return 0; 4522 } 4523 return float16_to_int16_round_to_zero(f, s); 4524 } 4525 4526 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) 4527 { 4528 if (float16_is_any_nan(f)) { 4529 float_raise(float_flag_invalid, s); 4530 return 0; 4531 } 4532 return float16_to_int64_round_to_zero(f, s); 4533 } 4534 4535 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) 4536 { 4537 if (float32_is_any_nan(f)) { 4538 float_raise(float_flag_invalid, s); 4539 return 0; 4540 } 4541 return float32_to_int64_round_to_zero(f, s); 4542 } 4543 4544 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) 4545 { 4546 if (float64_is_any_nan(f)) { 4547 float_raise(float_flag_invalid, s); 4548 return 0; 4549 } 4550 return float64_to_int64_round_to_zero(f, s); 4551 } 4552 4553 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) 4554 { 4555 if (float16_is_any_nan(f)) { 4556 float_raise(float_flag_invalid, s); 4557 return 0; 4558 } 4559 return float16_to_uint16_round_to_zero(f, s); 4560 } 4561 4562 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) 4563 { 4564 if (float16_is_any_nan(f)) { 4565 float_raise(float_flag_invalid, s); 4566 return 0; 4567 } 4568 return float16_to_uint64_round_to_zero(f, s); 4569 } 4570 4571 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) 4572 { 4573 if (float32_is_any_nan(f)) { 4574 float_raise(float_flag_invalid, s); 4575 return 0; 4576 } 4577 return float32_to_uint64_round_to_zero(f, s); 4578 } 4579 4580 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) 4581 { 4582 if (float64_is_any_nan(f)) { 4583 float_raise(float_flag_invalid, s); 4584 return 0; 4585 } 4586 return float64_to_uint64_round_to_zero(f, s); 4587 } 4588 4589 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) 4590 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) 4591 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) 4592 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16) 4593 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) 4594 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, 
float64_to_float32) 4595 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) 4596 4597 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) 4598 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) 4599 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) 4600 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) 4601 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 4602 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) 4603 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) 4604 4605 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) 4606 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) 4607 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) 4608 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) 4609 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) 4610 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) 4611 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) 4612 4613 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) 4614 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) 4615 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) 4616 4617 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) 4618 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) 4619 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) 4620 4621 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) 4622 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) 4623 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) 4624 4625 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) 4626 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) 4627 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) 4628 4629 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) 4630 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) 4631 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) 4632 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) 4633 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) 4634 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) 4635 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) 4636 4637 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) 4638 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) 4639 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) 4640 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) 4641 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) 4642 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) 4643 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) 4644 4645 static int16_t do_float16_logb_as_int(float16 a, float_status *s) 4646 { 4647 /* Extract frac to the top of the uint32_t. 
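     * (float16 keeps its 10 fraction bits in [9:0]; shifting by 16 + 6 = 22
     * places the fraction msb at bit 31, so clz32 below counts the leading
     * zeros of the fraction directly.)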
*/ 4648 uint32_t frac = (uint32_t)a << (16 + 6); 4649 int16_t exp = extract32(a, 10, 5); 4650 4651 if (unlikely(exp == 0)) { 4652 if (frac != 0) { 4653 if (!get_flush_inputs_to_zero(s)) { 4654 /* denormal: bias - fractional_zeros */ 4655 return -15 - clz32(frac); 4656 } 4657 /* flush to zero */ 4658 float_raise(float_flag_input_denormal, s); 4659 } 4660 } else if (unlikely(exp == 0x1f)) { 4661 if (frac == 0) { 4662 return INT16_MAX; /* infinity */ 4663 } 4664 } else { 4665 /* normal: exp - bias */ 4666 return exp - 15; 4667 } 4668 /* nan or zero */ 4669 float_raise(float_flag_invalid, s); 4670 return INT16_MIN; 4671 } 4672 4673 static int32_t do_float32_logb_as_int(float32 a, float_status *s) 4674 { 4675 /* Extract frac to the top of the uint32_t. */ 4676 uint32_t frac = a << 9; 4677 int32_t exp = extract32(a, 23, 8); 4678 4679 if (unlikely(exp == 0)) { 4680 if (frac != 0) { 4681 if (!get_flush_inputs_to_zero(s)) { 4682 /* denormal: bias - fractional_zeros */ 4683 return -127 - clz32(frac); 4684 } 4685 /* flush to zero */ 4686 float_raise(float_flag_input_denormal, s); 4687 } 4688 } else if (unlikely(exp == 0xff)) { 4689 if (frac == 0) { 4690 return INT32_MAX; /* infinity */ 4691 } 4692 } else { 4693 /* normal: exp - bias */ 4694 return exp - 127; 4695 } 4696 /* nan or zero */ 4697 float_raise(float_flag_invalid, s); 4698 return INT32_MIN; 4699 } 4700 4701 static int64_t do_float64_logb_as_int(float64 a, float_status *s) 4702 { 4703 /* Extract frac to the top of the uint64_t. */ 4704 uint64_t frac = a << 12; 4705 int64_t exp = extract64(a, 52, 11); 4706 4707 if (unlikely(exp == 0)) { 4708 if (frac != 0) { 4709 if (!get_flush_inputs_to_zero(s)) { 4710 /* denormal: bias - fractional_zeros */ 4711 return -1023 - clz64(frac); 4712 } 4713 /* flush to zero */ 4714 float_raise(float_flag_input_denormal, s); 4715 } 4716 } else if (unlikely(exp == 0x7ff)) { 4717 if (frac == 0) { 4718 return INT64_MAX; /* infinity */ 4719 } 4720 } else { 4721 /* normal: exp - bias */ 4722 return exp - 1023; 4723 } 4724 /* nan or zero */ 4725 float_raise(float_flag_invalid, s); 4726 return INT64_MIN; 4727 } 4728 4729 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) 4730 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) 4731 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) 4732 4733 #undef DO_ZPZ_FP 4734 4735 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, 4736 float_status *status, uint32_t desc, 4737 uint16_t neg1, uint16_t neg3) 4738 { 4739 intptr_t i = simd_oprsz(desc); 4740 uint64_t *g = vg; 4741 4742 do { 4743 uint64_t pg = g[(i - 1) >> 6]; 4744 do { 4745 i -= 2; 4746 if (likely((pg >> (i & 63)) & 1)) { 4747 float16 e1, e2, e3, r; 4748 4749 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; 4750 e2 = *(uint16_t *)(vm + H1_2(i)); 4751 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; 4752 r = float16_muladd(e1, e2, e3, 0, status); 4753 *(uint16_t *)(vd + H1_2(i)) = r; 4754 } 4755 } while (i & 63); 4756 } while (i != 0); 4757 } 4758 4759 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4760 void *vg, float_status *status, uint32_t desc) 4761 { 4762 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0); 4763 } 4764 4765 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4766 void *vg, float_status *status, uint32_t desc) 4767 { 4768 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0); 4769 } 4770 4771 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4772 void *vg, float_status *status, uint32_t desc) 4773 { 
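    /* FNMLA negates both the product and the addend: flip the fp16 sign
     * bit (0x8000) of both the VN and VA inputs.
     */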
4774 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000); 4775 } 4776 4777 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4778 void *vg, float_status *status, uint32_t desc) 4779 { 4780 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000); 4781 } 4782 4783 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, 4784 float_status *status, uint32_t desc, 4785 uint32_t neg1, uint32_t neg3) 4786 { 4787 intptr_t i = simd_oprsz(desc); 4788 uint64_t *g = vg; 4789 4790 do { 4791 uint64_t pg = g[(i - 1) >> 6]; 4792 do { 4793 i -= 4; 4794 if (likely((pg >> (i & 63)) & 1)) { 4795 float32 e1, e2, e3, r; 4796 4797 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; 4798 e2 = *(uint32_t *)(vm + H1_4(i)); 4799 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; 4800 r = float32_muladd(e1, e2, e3, 0, status); 4801 *(uint32_t *)(vd + H1_4(i)) = r; 4802 } 4803 } while (i & 63); 4804 } while (i != 0); 4805 } 4806 4807 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4808 void *vg, float_status *status, uint32_t desc) 4809 { 4810 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0); 4811 } 4812 4813 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4814 void *vg, float_status *status, uint32_t desc) 4815 { 4816 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0); 4817 } 4818 4819 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4820 void *vg, float_status *status, uint32_t desc) 4821 { 4822 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000); 4823 } 4824 4825 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4826 void *vg, float_status *status, uint32_t desc) 4827 { 4828 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000); 4829 } 4830 4831 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, 4832 float_status *status, uint32_t desc, 4833 uint64_t neg1, uint64_t neg3) 4834 { 4835 intptr_t i = simd_oprsz(desc); 4836 uint64_t *g = vg; 4837 4838 do { 4839 uint64_t pg = g[(i - 1) >> 6]; 4840 do { 4841 i -= 8; 4842 if (likely((pg >> (i & 63)) & 1)) { 4843 float64 e1, e2, e3, r; 4844 4845 e1 = *(uint64_t *)(vn + i) ^ neg1; 4846 e2 = *(uint64_t *)(vm + i); 4847 e3 = *(uint64_t *)(va + i) ^ neg3; 4848 r = float64_muladd(e1, e2, e3, 0, status); 4849 *(uint64_t *)(vd + i) = r; 4850 } 4851 } while (i & 63); 4852 } while (i != 0); 4853 } 4854 4855 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4856 void *vg, float_status *status, uint32_t desc) 4857 { 4858 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0); 4859 } 4860 4861 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4862 void *vg, float_status *status, uint32_t desc) 4863 { 4864 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0); 4865 } 4866 4867 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4868 void *vg, float_status *status, uint32_t desc) 4869 { 4870 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN); 4871 } 4872 4873 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4874 void *vg, float_status *status, uint32_t desc) 4875 { 4876 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN); 4877 } 4878 4879 /* Two operand floating-point comparison controlled by a predicate. 
4880 * Unlike the integer version, we are not allowed to optimistically 4881 * compare operands, since the comparison may have side effects wrt 4882 * the FPSR. 4883 */ 4884 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ 4885 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4886 float_status *status, uint32_t desc) \ 4887 { \ 4888 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 4889 uint64_t *d = vd, *g = vg; \ 4890 do { \ 4891 uint64_t out = 0, pg = g[j]; \ 4892 do { \ 4893 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 4894 if (likely((pg >> (i & 63)) & 1)) { \ 4895 TYPE nn = *(TYPE *)(vn + H(i)); \ 4896 TYPE mm = *(TYPE *)(vm + H(i)); \ 4897 out |= OP(TYPE, nn, mm, status); \ 4898 } \ 4899 } while (i & 63); \ 4900 d[j--] = out; \ 4901 } while (i > 0); \ 4902 } 4903 4904 #define DO_FPCMP_PPZZ_H(NAME, OP) \ 4905 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) 4906 #define DO_FPCMP_PPZZ_S(NAME, OP) \ 4907 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) 4908 #define DO_FPCMP_PPZZ_D(NAME, OP) \ 4909 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) 4910 4911 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \ 4912 DO_FPCMP_PPZZ_H(NAME, OP) \ 4913 DO_FPCMP_PPZZ_S(NAME, OP) \ 4914 DO_FPCMP_PPZZ_D(NAME, OP) 4915 4916 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 4917 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 4918 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 4919 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 4920 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 4921 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 4922 #define DO_FCMUO(TYPE, X, Y, ST) \ 4923 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered 4924 #define DO_FACGE(TYPE, X, Y, ST) \ 4925 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 4926 #define DO_FACGT(TYPE, X, Y, ST) \ 4927 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 4928 4929 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) 4930 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) 4931 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) 4932 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) 4933 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) 4934 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) 4935 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) 4936 4937 #undef DO_FPCMP_PPZZ_ALL 4938 #undef DO_FPCMP_PPZZ_D 4939 #undef DO_FPCMP_PPZZ_S 4940 #undef DO_FPCMP_PPZZ_H 4941 #undef DO_FPCMP_PPZZ 4942 4943 /* One operand floating-point comparison against zero, controlled 4944 * by a predicate. 
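 * The zero operand is supplied as a literal 0 to the same DO_FCM* macros
 * used above, so e.g. the compare-less-or-equal and compare-less-than
 * forms simply reuse DO_FCMLE and DO_FCMLT.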
4945 */ 4946 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ 4947 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 4948 float_status *status, uint32_t desc) \ 4949 { \ 4950 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 4951 uint64_t *d = vd, *g = vg; \ 4952 do { \ 4953 uint64_t out = 0, pg = g[j]; \ 4954 do { \ 4955 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 4956 if ((pg >> (i & 63)) & 1) { \ 4957 TYPE nn = *(TYPE *)(vn + H(i)); \ 4958 out |= OP(TYPE, nn, 0, status); \ 4959 } \ 4960 } while (i & 63); \ 4961 d[j--] = out; \ 4962 } while (i > 0); \ 4963 } 4964 4965 #define DO_FPCMP_PPZ0_H(NAME, OP) \ 4966 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) 4967 #define DO_FPCMP_PPZ0_S(NAME, OP) \ 4968 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) 4969 #define DO_FPCMP_PPZ0_D(NAME, OP) \ 4970 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) 4971 4972 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \ 4973 DO_FPCMP_PPZ0_H(NAME, OP) \ 4974 DO_FPCMP_PPZ0_S(NAME, OP) \ 4975 DO_FPCMP_PPZ0_D(NAME, OP) 4976 4977 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) 4978 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) 4979 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) 4980 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) 4981 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) 4982 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) 4983 4984 /* FP Trig Multiply-Add. */ 4985 4986 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, 4987 float_status *s, uint32_t desc) 4988 { 4989 static const float16 coeff[16] = { 4990 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 4991 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 4992 }; 4993 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); 4994 intptr_t x = simd_data(desc); 4995 float16 *d = vd, *n = vn, *m = vm; 4996 for (i = 0; i < opr_sz; i++) { 4997 float16 mm = m[i]; 4998 intptr_t xx = x; 4999 if (float16_is_neg(mm)) { 5000 mm = float16_abs(mm); 5001 xx += 8; 5002 } 5003 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, s); 5004 } 5005 } 5006 5007 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, 5008 float_status *s, uint32_t desc) 5009 { 5010 static const float32 coeff[16] = { 5011 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, 5012 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, 5013 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, 5014 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, 5015 }; 5016 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); 5017 intptr_t x = simd_data(desc); 5018 float32 *d = vd, *n = vn, *m = vm; 5019 for (i = 0; i < opr_sz; i++) { 5020 float32 mm = m[i]; 5021 intptr_t xx = x; 5022 if (float32_is_neg(mm)) { 5023 mm = float32_abs(mm); 5024 xx += 8; 5025 } 5026 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, s); 5027 } 5028 } 5029 5030 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, 5031 float_status *s, uint32_t desc) 5032 { 5033 static const float64 coeff[16] = { 5034 0x3ff0000000000000ull, 0xbfc5555555555543ull, 5035 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, 5036 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, 5037 0x3de5d8408868552full, 0x0000000000000000ull, 5038 0x3ff0000000000000ull, 0xbfe0000000000000ull, 5039 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, 5040 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, 5041 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, 5042 }; 5043 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); 5044 intptr_t x = simd_data(desc); 5045 float64 *d = vd, *n = vn, *m = vm; 5046 for (i = 0; i < opr_sz; i++) { 5047 float64 mm = m[i]; 5048 intptr_t xx = x; 5049 if (float64_is_neg(mm)) { 5050 mm = float64_abs(mm); 5051 xx += 8; 5052 } 
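        /* Per the FTMAD pseudocode, a negative M selects the second half
         * of the coefficient table and contributes only its magnitude.
         */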
5053 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, s); 5054 } 5055 } 5056 5057 /* 5058 * FP Complex Add 5059 */ 5060 5061 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, 5062 float_status *s, uint32_t desc) 5063 { 5064 intptr_t j, i = simd_oprsz(desc); 5065 uint64_t *g = vg; 5066 float16 neg_imag = float16_set_sign(0, simd_data(desc)); 5067 float16 neg_real = float16_chs(neg_imag); 5068 5069 do { 5070 uint64_t pg = g[(i - 1) >> 6]; 5071 do { 5072 float16 e0, e1, e2, e3; 5073 5074 /* I holds the real index; J holds the imag index. */ 5075 j = i - sizeof(float16); 5076 i -= 2 * sizeof(float16); 5077 5078 e0 = *(float16 *)(vn + H1_2(i)); 5079 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real; 5080 e2 = *(float16 *)(vn + H1_2(j)); 5081 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag; 5082 5083 if (likely((pg >> (i & 63)) & 1)) { 5084 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s); 5085 } 5086 if (likely((pg >> (j & 63)) & 1)) { 5087 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s); 5088 } 5089 } while (i & 63); 5090 } while (i != 0); 5091 } 5092 5093 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, 5094 float_status *s, uint32_t desc) 5095 { 5096 intptr_t j, i = simd_oprsz(desc); 5097 uint64_t *g = vg; 5098 float32 neg_imag = float32_set_sign(0, simd_data(desc)); 5099 float32 neg_real = float32_chs(neg_imag); 5100 5101 do { 5102 uint64_t pg = g[(i - 1) >> 6]; 5103 do { 5104 float32 e0, e1, e2, e3; 5105 5106 /* I holds the real index; J holds the imag index. */ 5107 j = i - sizeof(float32); 5108 i -= 2 * sizeof(float32); 5109 5110 e0 = *(float32 *)(vn + H1_2(i)); 5111 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real; 5112 e2 = *(float32 *)(vn + H1_2(j)); 5113 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag; 5114 5115 if (likely((pg >> (i & 63)) & 1)) { 5116 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s); 5117 } 5118 if (likely((pg >> (j & 63)) & 1)) { 5119 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s); 5120 } 5121 } while (i & 63); 5122 } while (i != 0); 5123 } 5124 5125 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, 5126 float_status *s, uint32_t desc) 5127 { 5128 intptr_t j, i = simd_oprsz(desc); 5129 uint64_t *g = vg; 5130 float64 neg_imag = float64_set_sign(0, simd_data(desc)); 5131 float64 neg_real = float64_chs(neg_imag); 5132 5133 do { 5134 uint64_t pg = g[(i - 1) >> 6]; 5135 do { 5136 float64 e0, e1, e2, e3; 5137 5138 /* I holds the real index; J holds the imag index. 
*/ 5139 j = i - sizeof(float64); 5140 i -= 2 * sizeof(float64); 5141 5142 e0 = *(float64 *)(vn + H1_2(i)); 5143 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real; 5144 e2 = *(float64 *)(vn + H1_2(j)); 5145 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag; 5146 5147 if (likely((pg >> (i & 63)) & 1)) { 5148 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s); 5149 } 5150 if (likely((pg >> (j & 63)) & 1)) { 5151 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s); 5152 } 5153 } while (i & 63); 5154 } while (i != 0); 5155 } 5156 5157 /* 5158 * FP Complex Multiply 5159 */ 5160 5161 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5162 void *vg, float_status *status, uint32_t desc) 5163 { 5164 intptr_t j, i = simd_oprsz(desc); 5165 unsigned rot = simd_data(desc); 5166 bool flip = rot & 1; 5167 float16 neg_imag, neg_real; 5168 uint64_t *g = vg; 5169 5170 neg_imag = float16_set_sign(0, (rot & 2) != 0); 5171 neg_real = float16_set_sign(0, rot == 1 || rot == 2); 5172 5173 do { 5174 uint64_t pg = g[(i - 1) >> 6]; 5175 do { 5176 float16 e1, e2, e3, e4, nr, ni, mr, mi, d; 5177 5178 /* I holds the real index; J holds the imag index. */ 5179 j = i - sizeof(float16); 5180 i -= 2 * sizeof(float16); 5181 5182 nr = *(float16 *)(vn + H1_2(i)); 5183 ni = *(float16 *)(vn + H1_2(j)); 5184 mr = *(float16 *)(vm + H1_2(i)); 5185 mi = *(float16 *)(vm + H1_2(j)); 5186 5187 e2 = (flip ? ni : nr); 5188 e1 = (flip ? mi : mr) ^ neg_real; 5189 e4 = e2; 5190 e3 = (flip ? mr : mi) ^ neg_imag; 5191 5192 if (likely((pg >> (i & 63)) & 1)) { 5193 d = *(float16 *)(va + H1_2(i)); 5194 d = float16_muladd(e2, e1, d, 0, status); 5195 *(float16 *)(vd + H1_2(i)) = d; 5196 } 5197 if (likely((pg >> (j & 63)) & 1)) { 5198 d = *(float16 *)(va + H1_2(j)); 5199 d = float16_muladd(e4, e3, d, 0, status); 5200 *(float16 *)(vd + H1_2(j)) = d; 5201 } 5202 } while (i & 63); 5203 } while (i != 0); 5204 } 5205 5206 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5207 void *vg, float_status *status, uint32_t desc) 5208 { 5209 intptr_t j, i = simd_oprsz(desc); 5210 unsigned rot = simd_data(desc); 5211 bool flip = rot & 1; 5212 float32 neg_imag, neg_real; 5213 uint64_t *g = vg; 5214 5215 neg_imag = float32_set_sign(0, (rot & 2) != 0); 5216 neg_real = float32_set_sign(0, rot == 1 || rot == 2); 5217 5218 do { 5219 uint64_t pg = g[(i - 1) >> 6]; 5220 do { 5221 float32 e1, e2, e3, e4, nr, ni, mr, mi, d; 5222 5223 /* I holds the real index; J holds the imag index. */ 5224 j = i - sizeof(float32); 5225 i -= 2 * sizeof(float32); 5226 5227 nr = *(float32 *)(vn + H1_2(i)); 5228 ni = *(float32 *)(vn + H1_2(j)); 5229 mr = *(float32 *)(vm + H1_2(i)); 5230 mi = *(float32 *)(vm + H1_2(j)); 5231 5232 e2 = (flip ? ni : nr); 5233 e1 = (flip ? mi : mr) ^ neg_real; 5234 e4 = e2; 5235 e3 = (flip ? 
mr : mi) ^ neg_imag; 5236 5237 if (likely((pg >> (i & 63)) & 1)) { 5238 d = *(float32 *)(va + H1_2(i)); 5239 d = float32_muladd(e2, e1, d, 0, status); 5240 *(float32 *)(vd + H1_2(i)) = d; 5241 } 5242 if (likely((pg >> (j & 63)) & 1)) { 5243 d = *(float32 *)(va + H1_2(j)); 5244 d = float32_muladd(e4, e3, d, 0, status); 5245 *(float32 *)(vd + H1_2(j)) = d; 5246 } 5247 } while (i & 63); 5248 } while (i != 0); 5249 } 5250 5251 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5252 void *vg, float_status *status, uint32_t desc) 5253 { 5254 intptr_t j, i = simd_oprsz(desc); 5255 unsigned rot = simd_data(desc); 5256 bool flip = rot & 1; 5257 float64 neg_imag, neg_real; 5258 uint64_t *g = vg; 5259 5260 neg_imag = float64_set_sign(0, (rot & 2) != 0); 5261 neg_real = float64_set_sign(0, rot == 1 || rot == 2); 5262 5263 do { 5264 uint64_t pg = g[(i - 1) >> 6]; 5265 do { 5266 float64 e1, e2, e3, e4, nr, ni, mr, mi, d; 5267 5268 /* I holds the real index; J holds the imag index. */ 5269 j = i - sizeof(float64); 5270 i -= 2 * sizeof(float64); 5271 5272 nr = *(float64 *)(vn + H1_2(i)); 5273 ni = *(float64 *)(vn + H1_2(j)); 5274 mr = *(float64 *)(vm + H1_2(i)); 5275 mi = *(float64 *)(vm + H1_2(j)); 5276 5277 e2 = (flip ? ni : nr); 5278 e1 = (flip ? mi : mr) ^ neg_real; 5279 e4 = e2; 5280 e3 = (flip ? mr : mi) ^ neg_imag; 5281 5282 if (likely((pg >> (i & 63)) & 1)) { 5283 d = *(float64 *)(va + H1_2(i)); 5284 d = float64_muladd(e2, e1, d, 0, status); 5285 *(float64 *)(vd + H1_2(i)) = d; 5286 } 5287 if (likely((pg >> (j & 63)) & 1)) { 5288 d = *(float64 *)(va + H1_2(j)); 5289 d = float64_muladd(e4, e3, d, 0, status); 5290 *(float64 *)(vd + H1_2(j)) = d; 5291 } 5292 } while (i & 63); 5293 } while (i != 0); 5294 } 5295 5296 /* 5297 * Load contiguous data, protected by a governing predicate. 5298 */ 5299 5300 /* 5301 * Skip through a sequence of inactive elements in the guarding predicate @vg, 5302 * beginning at @reg_off bounded by @reg_max. Return the offset of the active 5303 * element >= @reg_off, or @reg_max if there were no active elements at all. 5304 */ 5305 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, 5306 intptr_t reg_max, int esz) 5307 { 5308 uint64_t pg_mask = pred_esz_masks[esz]; 5309 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); 5310 5311 /* In normal usage, the first element is active. */ 5312 if (likely(pg & 1)) { 5313 return reg_off; 5314 } 5315 5316 if (pg == 0) { 5317 reg_off &= -64; 5318 do { 5319 reg_off += 64; 5320 if (unlikely(reg_off >= reg_max)) { 5321 /* The entire predicate was false. */ 5322 return reg_max; 5323 } 5324 pg = vg[reg_off >> 6] & pg_mask; 5325 } while (pg == 0); 5326 } 5327 reg_off += ctz64(pg); 5328 5329 /* We should never see an out of range predicate bit set. */ 5330 tcg_debug_assert(reg_off < reg_max); 5331 return reg_off; 5332 } 5333 5334 /* 5335 * Resolve the guest virtual address to info->host and info->flags. 5336 * If @nofault, return false if the page is invalid, otherwise 5337 * exit via page fault exception. 5338 */ 5339 5340 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, 5341 target_ulong addr, int mem_off, MMUAccessType access_type, 5342 int mmu_idx, uintptr_t retaddr) 5343 { 5344 int flags; 5345 5346 addr += mem_off; 5347 5348 /* 5349 * User-only currently always issues with TBI. See the comment 5350 * above useronly_clean_ptr. Usually we clean this top byte away 5351 * during translation, but we can't do that for e.g. vector + imm 5352 * addressing modes. 
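 *
 * "Cleaning" is only sketched here; the authoritative definition is
 * useronly_clean_ptr and may differ in detail.  The idea is to fold away
 * the tag byte, bits [63:56], so it cannot affect the lookup, roughly:
 *
 *     addr = sextract64(addr, 0, 56);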
5353 * 5354 * We currently always enable TBI for user-only, and do not provide 5355 * a way to turn it off. So clean the pointer unconditionally here, 5356 * rather than look it up here, or pass it down from above. 5357 */ 5358 addr = useronly_clean_ptr(addr); 5359 5360 #ifdef CONFIG_USER_ONLY 5361 flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault, 5362 &info->host, retaddr); 5363 #else 5364 CPUTLBEntryFull *full; 5365 flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault, 5366 &info->host, &full, retaddr); 5367 #endif 5368 info->flags = flags; 5369 5370 if (flags & TLB_INVALID_MASK) { 5371 g_assert(nofault); 5372 return false; 5373 } 5374 5375 #ifdef CONFIG_USER_ONLY 5376 memset(&info->attrs, 0, sizeof(info->attrs)); 5377 /* Require both ANON and MTE; see allocation_tag_mem(). */ 5378 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); 5379 #else 5380 info->attrs = full->attrs; 5381 info->tagged = full->extra.arm.pte_attrs == 0xf0; 5382 #endif 5383 5384 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ 5385 info->host -= mem_off; 5386 return true; 5387 } 5388 5389 /* 5390 * Find first active element on each page, and a loose bound for the 5391 * final element on each page. Identify any single element that spans 5392 * the page boundary. Return true if there are any active elements. 5393 */ 5394 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, 5395 intptr_t reg_max, int esz, int msize) 5396 { 5397 const int esize = 1 << esz; 5398 const uint64_t pg_mask = pred_esz_masks[esz]; 5399 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 5400 intptr_t mem_off_last, mem_off_split; 5401 intptr_t page_split, elt_split; 5402 intptr_t i; 5403 5404 /* Set all of the element indices to -1, and the TLB data to 0. */ 5405 memset(info, -1, offsetof(SVEContLdSt, page)); 5406 memset(info->page, 0, sizeof(info->page)); 5407 5408 /* Gross scan over the entire predicate to find bounds. */ 5409 i = 0; 5410 do { 5411 uint64_t pg = vg[i] & pg_mask; 5412 if (pg) { 5413 reg_off_last = i * 64 + 63 - clz64(pg); 5414 if (reg_off_first < 0) { 5415 reg_off_first = i * 64 + ctz64(pg); 5416 } 5417 } 5418 } while (++i * 64 < reg_max); 5419 5420 if (unlikely(reg_off_first < 0)) { 5421 /* No active elements, no pages touched. */ 5422 return false; 5423 } 5424 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); 5425 5426 info->reg_off_first[0] = reg_off_first; 5427 info->mem_off_first[0] = (reg_off_first >> esz) * msize; 5428 mem_off_last = (reg_off_last >> esz) * msize; 5429 5430 page_split = -(addr | TARGET_PAGE_MASK); 5431 if (likely(mem_off_last + msize <= page_split)) { 5432 /* The entire operation fits within a single page. */ 5433 info->reg_off_last[0] = reg_off_last; 5434 return true; 5435 } 5436 5437 info->page_split = page_split; 5438 elt_split = page_split / msize; 5439 reg_off_split = elt_split << esz; 5440 mem_off_split = elt_split * msize; 5441 5442 /* 5443 * This is the last full element on the first page, but it is not 5444 * necessarily active. If there is no full element, i.e. the first 5445 * active element is the one that's split, this value remains -1. 5446 * It is useful as iteration bounds. 5447 */ 5448 if (elt_split != 0) { 5449 info->reg_off_last[0] = reg_off_split - esize; 5450 } 5451 5452 /* Determine if an unaligned element spans the pages. */ 5453 if (page_split % msize != 0) { 5454 /* It is helpful to know if the split element is active. 
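 *
 * A worked example of the bookkeeping above (illustrative numbers only):
 * with 4K pages, esz == MO_32 (esize 4), msize == 4 and addr sitting
 * 0xffa bytes into its page (6 bytes short of the boundary), we get
 * page_split == 6, elt_split == 1, reg_off_split == 4, mem_off_split == 4.
 * Element 0 is the only whole element on the first page
 * (reg_off_last[0] == 0), element 1 straddles the boundary because
 * page_split % msize != 0, and element 2 onward starts on the second page.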
*/ 5455 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { 5456 info->reg_off_split = reg_off_split; 5457 info->mem_off_split = mem_off_split; 5458 5459 if (reg_off_split == reg_off_last) { 5460 /* The page crossing element is last. */ 5461 return true; 5462 } 5463 } 5464 reg_off_split += esize; 5465 mem_off_split += msize; 5466 } 5467 5468 /* 5469 * We do want the first active element on the second page, because 5470 * this may affect the address reported in an exception. 5471 */ 5472 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); 5473 tcg_debug_assert(reg_off_split <= reg_off_last); 5474 info->reg_off_first[1] = reg_off_split; 5475 info->mem_off_first[1] = (reg_off_split >> esz) * msize; 5476 info->reg_off_last[1] = reg_off_last; 5477 return true; 5478 } 5479 5480 /* 5481 * Resolve the guest virtual addresses to info->page[]. 5482 * Control the generation of page faults with @fault. Return false if 5483 * there is no work to do, which can only happen with @fault == FAULT_NO. 5484 */ 5485 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, 5486 CPUARMState *env, target_ulong addr, 5487 MMUAccessType access_type, uintptr_t retaddr) 5488 { 5489 int mmu_idx = arm_env_mmu_index(env); 5490 int mem_off = info->mem_off_first[0]; 5491 bool nofault = fault == FAULT_NO; 5492 bool have_work = true; 5493 5494 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, 5495 access_type, mmu_idx, retaddr)) { 5496 /* No work to be done. */ 5497 return false; 5498 } 5499 5500 if (likely(info->page_split < 0)) { 5501 /* The entire operation was on the one page. */ 5502 return true; 5503 } 5504 5505 /* 5506 * If the second page is invalid, then we want the fault address to be 5507 * the first byte on that page which is accessed. 5508 */ 5509 if (info->mem_off_split >= 0) { 5510 /* 5511 * There is an element split across the pages. The fault address 5512 * should be the first byte of the second page. 5513 */ 5514 mem_off = info->page_split; 5515 /* 5516 * If the split element is also the first active element 5517 * of the vector, then: For first-fault we should continue 5518 * to generate faults for the second page. For no-fault, 5519 * we have work only if the second page is valid. 5520 */ 5521 if (info->mem_off_first[0] < info->mem_off_split) { 5522 nofault = FAULT_FIRST; 5523 have_work = false; 5524 } 5525 } else { 5526 /* 5527 * There is no element split across the pages. The fault address 5528 * should be the first active element on the second page. 5529 */ 5530 mem_off = info->mem_off_first[1]; 5531 /* 5532 * There must have been one active element on the first page, 5533 * so we're out of first-fault territory. 5534 */ 5535 nofault = fault != FAULT_ALL; 5536 } 5537 5538 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off, 5539 access_type, mmu_idx, retaddr); 5540 return have_work; 5541 } 5542 5543 #ifndef CONFIG_USER_ONLY 5544 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, 5545 uint64_t *vg, target_ulong addr, 5546 int esize, int msize, int wp_access, 5547 uintptr_t retaddr) 5548 { 5549 intptr_t mem_off, reg_off, reg_last; 5550 int flags0 = info->page[0].flags; 5551 int flags1 = info->page[1].flags; 5552 5553 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { 5554 return; 5555 } 5556 5557 /* Indicate that watchpoints are handled. 
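 * Clearing TLB_WATCHPOINT from the cached flags means that the callers'
 * "flags != 0" test does not force the slow MMIO path merely because a
 * watchpoint was present; all watchpoint checks are completed here,
 * before any data is transferred.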
*/
5558 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5559 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5560
5561 if (flags0 & TLB_WATCHPOINT) {
5562 mem_off = info->mem_off_first[0];
5563 reg_off = info->reg_off_first[0];
5564 reg_last = info->reg_off_last[0];
5565
5566 while (reg_off <= reg_last) {
5567 uint64_t pg = vg[reg_off >> 6];
5568 do {
5569 if ((pg >> (reg_off & 63)) & 1) {
5570 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5571 msize, info->page[0].attrs,
5572 wp_access, retaddr);
5573 }
5574 reg_off += esize;
5575 mem_off += msize;
5576 } while (reg_off <= reg_last && (reg_off & 63));
5577 }
5578 }
5579
5580 mem_off = info->mem_off_split;
5581 if (mem_off >= 0) {
5582 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5583 info->page[0].attrs, wp_access, retaddr);
5584 }
5585
5586 mem_off = info->mem_off_first[1];
5587 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5588 reg_off = info->reg_off_first[1];
5589 reg_last = info->reg_off_last[1];
5590
5591 do {
5592 uint64_t pg = vg[reg_off >> 6];
5593 do {
5594 if ((pg >> (reg_off & 63)) & 1) {
5595 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5596 msize, info->page[1].attrs,
5597 wp_access, retaddr);
5598 }
5599 reg_off += esize;
5600 mem_off += msize;
5601 } while (reg_off & 63);
5602 } while (reg_off <= reg_last);
5603 }
5604 }
5605 #endif
5606
5607 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5608 uint64_t *vg, target_ulong addr, int esize,
5609 int msize, uint32_t mtedesc, uintptr_t ra)
5610 {
5611 intptr_t mem_off, reg_off, reg_last;
5612
5613 /* Process the page only if MemAttr == Tagged. */
5614 if (info->page[0].tagged) {
5615 mem_off = info->mem_off_first[0];
5616 reg_off = info->reg_off_first[0];
5617 reg_last = info->reg_off_split;
5618 if (reg_last < 0) {
5619 reg_last = info->reg_off_last[0];
5620 }
5621
5622 do {
5623 uint64_t pg = vg[reg_off >> 6];
5624 do {
5625 if ((pg >> (reg_off & 63)) & 1) {
5626 mte_check(env, mtedesc, addr, ra);
5627 }
5628 reg_off += esize;
5629 mem_off += msize;
5630 } while (reg_off <= reg_last && (reg_off & 63));
5631 } while (reg_off <= reg_last);
5632 }
5633
5634 mem_off = info->mem_off_first[1];
5635 if (mem_off >= 0 && info->page[1].tagged) {
5636 reg_off = info->reg_off_first[1];
5637 reg_last = info->reg_off_last[1];
5638
5639 do {
5640 uint64_t pg = vg[reg_off >> 6];
5641 do {
5642 if ((pg >> (reg_off & 63)) & 1) {
5643 mte_check(env, mtedesc, addr, ra);
5644 }
5645 reg_off += esize;
5646 mem_off += msize;
5647 } while (reg_off & 63);
5648 } while (reg_off <= reg_last);
5649 }
5650 }
5651
5652 /*
5653 * Common helper for all contiguous 1,2,3,4-register predicated loads.
5654 */
5655 static inline QEMU_ALWAYS_INLINE
5656 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5657 uint32_t desc, const uintptr_t retaddr,
5658 const int esz, const int msz, const int N, uint32_t mtedesc,
5659 sve_ldst1_host_fn *host_fn,
5660 sve_ldst1_tlb_fn *tlb_fn)
5661 {
5662 const unsigned rd = simd_data(desc);
5663 const intptr_t reg_max = simd_oprsz(desc);
5664 intptr_t reg_off, reg_last, mem_off;
5665 SVEContLdSt info;
5666 void *host;
5667 int flags, i;
5668
5669 /* Find the active elements. */
5670 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5671 /* The entire predicate was false; no load occurs. */
5672 for (i = 0; i < N; ++i) {
5673 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5674 }
5675 return;
5676 }
5677
5678 /* Probe the page(s). Exit with exception for any invalid page.
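 *
 * A note on the layout for N > 1 (LD2/LD3/LD4): each active element
 * loads N consecutive msize-sized quantities and de-interleaves them
 * into zregs rd..rd+N-1.  As a sketch for ld2h (N == 2, msz == MO_16),
 * writing zd0/zd1 for the two destinations, element e is filled from
 *
 *     zd0[e] = mem[addr + 4 * e + 0];
 *     zd1[e] = mem[addr + 4 * e + 2];
 *
 * which is the "addr + mem_off + (i << msz)" addressing used below.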
*/ 5679 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); 5680 5681 /* Handle watchpoints for all active elements. */ 5682 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 5683 BP_MEM_READ, retaddr); 5684 5685 /* 5686 * Handle mte checks for all active elements. 5687 * Since TBI must be set for MTE, !mtedesc => !mte_active. 5688 */ 5689 if (mtedesc) { 5690 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 5691 mtedesc, retaddr); 5692 } 5693 5694 flags = info.page[0].flags | info.page[1].flags; 5695 if (unlikely(flags != 0)) { 5696 /* 5697 * At least one page includes MMIO. 5698 * Any bus operation can fail with cpu_transaction_failed, 5699 * which for ARM will raise SyncExternal. Perform the load 5700 * into scratch memory to preserve register state until the end. 5701 */ 5702 ARMVectorReg scratch[4] = { }; 5703 5704 mem_off = info.mem_off_first[0]; 5705 reg_off = info.reg_off_first[0]; 5706 reg_last = info.reg_off_last[1]; 5707 if (reg_last < 0) { 5708 reg_last = info.reg_off_split; 5709 if (reg_last < 0) { 5710 reg_last = info.reg_off_last[0]; 5711 } 5712 } 5713 5714 do { 5715 uint64_t pg = vg[reg_off >> 6]; 5716 do { 5717 if ((pg >> (reg_off & 63)) & 1) { 5718 for (i = 0; i < N; ++i) { 5719 tlb_fn(env, &scratch[i], reg_off, 5720 addr + mem_off + (i << msz), retaddr); 5721 } 5722 } 5723 reg_off += 1 << esz; 5724 mem_off += N << msz; 5725 } while (reg_off & 63); 5726 } while (reg_off <= reg_last); 5727 5728 for (i = 0; i < N; ++i) { 5729 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); 5730 } 5731 return; 5732 } 5733 5734 /* The entire operation is in RAM, on valid pages. */ 5735 5736 for (i = 0; i < N; ++i) { 5737 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5738 } 5739 5740 mem_off = info.mem_off_first[0]; 5741 reg_off = info.reg_off_first[0]; 5742 reg_last = info.reg_off_last[0]; 5743 host = info.page[0].host; 5744 5745 set_helper_retaddr(retaddr); 5746 5747 while (reg_off <= reg_last) { 5748 uint64_t pg = vg[reg_off >> 6]; 5749 do { 5750 if ((pg >> (reg_off & 63)) & 1) { 5751 for (i = 0; i < N; ++i) { 5752 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5753 host + mem_off + (i << msz)); 5754 } 5755 } 5756 reg_off += 1 << esz; 5757 mem_off += N << msz; 5758 } while (reg_off <= reg_last && (reg_off & 63)); 5759 } 5760 5761 clear_helper_retaddr(); 5762 5763 /* 5764 * Use the slow path to manage the cross-page misalignment. 5765 * But we know this is RAM and cannot trap. 
5766 */ 5767 mem_off = info.mem_off_split; 5768 if (unlikely(mem_off >= 0)) { 5769 reg_off = info.reg_off_split; 5770 for (i = 0; i < N; ++i) { 5771 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 5772 addr + mem_off + (i << msz), retaddr); 5773 } 5774 } 5775 5776 mem_off = info.mem_off_first[1]; 5777 if (unlikely(mem_off >= 0)) { 5778 reg_off = info.reg_off_first[1]; 5779 reg_last = info.reg_off_last[1]; 5780 host = info.page[1].host; 5781 5782 set_helper_retaddr(retaddr); 5783 5784 do { 5785 uint64_t pg = vg[reg_off >> 6]; 5786 do { 5787 if ((pg >> (reg_off & 63)) & 1) { 5788 for (i = 0; i < N; ++i) { 5789 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5790 host + mem_off + (i << msz)); 5791 } 5792 } 5793 reg_off += 1 << esz; 5794 mem_off += N << msz; 5795 } while (reg_off & 63); 5796 } while (reg_off <= reg_last); 5797 5798 clear_helper_retaddr(); 5799 } 5800 } 5801 5802 static inline QEMU_ALWAYS_INLINE 5803 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 5804 uint32_t desc, const uintptr_t ra, 5805 const int esz, const int msz, const int N, 5806 sve_ldst1_host_fn *host_fn, 5807 sve_ldst1_tlb_fn *tlb_fn) 5808 { 5809 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 5810 int bit55 = extract64(addr, 55, 1); 5811 5812 /* Remove mtedesc from the normal sve descriptor. */ 5813 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 5814 5815 /* Perform gross MTE suppression early. */ 5816 if (!tbi_check(mtedesc, bit55) || 5817 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 5818 mtedesc = 0; 5819 } 5820 5821 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 5822 } 5823 5824 #define DO_LD1_1(NAME, ESZ) \ 5825 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ 5826 target_ulong addr, uint32_t desc) \ 5827 { \ 5828 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ 5829 sve_##NAME##_host, sve_##NAME##_tlb); \ 5830 } \ 5831 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ 5832 target_ulong addr, uint32_t desc) \ 5833 { \ 5834 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ 5835 sve_##NAME##_host, sve_##NAME##_tlb); \ 5836 } 5837 5838 #define DO_LD1_2(NAME, ESZ, MSZ) \ 5839 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ 5840 target_ulong addr, uint32_t desc) \ 5841 { \ 5842 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 5843 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 5844 } \ 5845 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ 5846 target_ulong addr, uint32_t desc) \ 5847 { \ 5848 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 5849 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 5850 } \ 5851 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 5852 target_ulong addr, uint32_t desc) \ 5853 { \ 5854 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 5855 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 5856 } \ 5857 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 5858 target_ulong addr, uint32_t desc) \ 5859 { \ 5860 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 5861 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 5862 } 5863 5864 DO_LD1_1(ld1bb, MO_8) 5865 DO_LD1_1(ld1bhu, MO_16) 5866 DO_LD1_1(ld1bhs, MO_16) 5867 DO_LD1_1(ld1bsu, MO_32) 5868 DO_LD1_1(ld1bss, MO_32) 5869 DO_LD1_1(ld1bdu, MO_64) 5870 DO_LD1_1(ld1bds, MO_64) 5871 5872 DO_LD1_2(ld1hh, MO_16, MO_16) 5873 DO_LD1_2(ld1hsu, MO_32, MO_16) 5874 DO_LD1_2(ld1hss, MO_32, MO_16) 5875 DO_LD1_2(ld1hdu, MO_64, MO_16) 5876 
DO_LD1_2(ld1hds, MO_64, MO_16) 5877 5878 DO_LD1_2(ld1ss, MO_32, MO_32) 5879 DO_LD1_2(ld1sdu, MO_64, MO_32) 5880 DO_LD1_2(ld1sds, MO_64, MO_32) 5881 5882 DO_LD1_2(ld1dd, MO_64, MO_64) 5883 5884 #undef DO_LD1_1 5885 #undef DO_LD1_2 5886 5887 #define DO_LDN_1(N) \ 5888 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ 5889 target_ulong addr, uint32_t desc) \ 5890 { \ 5891 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ 5892 sve_ld1bb_host, sve_ld1bb_tlb); \ 5893 } \ 5894 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ 5895 target_ulong addr, uint32_t desc) \ 5896 { \ 5897 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ 5898 sve_ld1bb_host, sve_ld1bb_tlb); \ 5899 } 5900 5901 #define DO_LDN_2(N, SUFF, ESZ) \ 5902 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ 5903 target_ulong addr, uint32_t desc) \ 5904 { \ 5905 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 5906 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 5907 } \ 5908 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ 5909 target_ulong addr, uint32_t desc) \ 5910 { \ 5911 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 5912 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 5913 } \ 5914 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ 5915 target_ulong addr, uint32_t desc) \ 5916 { \ 5917 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 5918 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 5919 } \ 5920 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ 5921 target_ulong addr, uint32_t desc) \ 5922 { \ 5923 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 5924 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 5925 } 5926 5927 DO_LDN_1(2) 5928 DO_LDN_1(3) 5929 DO_LDN_1(4) 5930 5931 DO_LDN_2(2, hh, MO_16) 5932 DO_LDN_2(3, hh, MO_16) 5933 DO_LDN_2(4, hh, MO_16) 5934 5935 DO_LDN_2(2, ss, MO_32) 5936 DO_LDN_2(3, ss, MO_32) 5937 DO_LDN_2(4, ss, MO_32) 5938 5939 DO_LDN_2(2, dd, MO_64) 5940 DO_LDN_2(3, dd, MO_64) 5941 DO_LDN_2(4, dd, MO_64) 5942 5943 #undef DO_LDN_1 5944 #undef DO_LDN_2 5945 5946 /* 5947 * Load contiguous data, first-fault and no-fault. 5948 * 5949 * For user-only, we control the race between page_check_range and 5950 * another thread's munmap by using set/clear_helper_retaddr. Any 5951 * SEGV that occurs between those markers is assumed to be because 5952 * the guest page vanished. Keep that block as small as possible 5953 * so that unrelated QEMU bugs are not blamed on the guest. 5954 */ 5955 5956 /* Fault on byte I. All bits in FFR from I are cleared. The vector 5957 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE 5958 * option, which leaves subsequent data unchanged. 5959 */ 5960 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) 5961 { 5962 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; 5963 5964 if (i & 63) { 5965 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); 5966 i = ROUND_UP(i, 64); 5967 } 5968 for (; i < oprsz; i += 64) { 5969 ffr[i / 64] = 0; 5970 } 5971 } 5972 5973 /* 5974 * Common helper for all contiguous no-fault and first-fault loads. 
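 *
 * On any suppressed element, record_fault() above truncates FFR at that
 * element and leaves the earlier bits alone.  A worked example, recalling
 * that predicates carry one bit per vector byte: with reg_max == 32
 * (a 256-bit vector) and a fault recorded at reg_off == 16, FFR keeps
 * bits [0, 16) and clears bits [16, 32), so the guest sees valid data for
 * the first two doublewords and can retry from the third.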
5975 */ 5976 static inline QEMU_ALWAYS_INLINE 5977 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, 5978 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc, 5979 const int esz, const int msz, const SVEContFault fault, 5980 sve_ldst1_host_fn *host_fn, 5981 sve_ldst1_tlb_fn *tlb_fn) 5982 { 5983 const unsigned rd = simd_data(desc); 5984 void *vd = &env->vfp.zregs[rd]; 5985 const intptr_t reg_max = simd_oprsz(desc); 5986 intptr_t reg_off, mem_off, reg_last; 5987 SVEContLdSt info; 5988 int flags; 5989 void *host; 5990 5991 /* Find the active elements. */ 5992 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) { 5993 /* The entire predicate was false; no load occurs. */ 5994 memset(vd, 0, reg_max); 5995 return; 5996 } 5997 reg_off = info.reg_off_first[0]; 5998 5999 /* Probe the page(s). */ 6000 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) { 6001 /* Fault on first element. */ 6002 tcg_debug_assert(fault == FAULT_NO); 6003 memset(vd, 0, reg_max); 6004 goto do_fault; 6005 } 6006 6007 mem_off = info.mem_off_first[0]; 6008 flags = info.page[0].flags; 6009 6010 /* 6011 * Disable MTE checking if the Tagged bit is not set. Since TBI must 6012 * be set within MTEDESC for MTE, !mtedesc => !mte_active. 6013 */ 6014 if (!info.page[0].tagged) { 6015 mtedesc = 0; 6016 } 6017 6018 if (fault == FAULT_FIRST) { 6019 /* Trapping mte check for the first-fault element. */ 6020 if (mtedesc) { 6021 mte_check(env, mtedesc, addr + mem_off, retaddr); 6022 } 6023 6024 /* 6025 * Special handling of the first active element, 6026 * if it crosses a page boundary or is MMIO. 6027 */ 6028 bool is_split = mem_off == info.mem_off_split; 6029 if (unlikely(flags != 0) || unlikely(is_split)) { 6030 /* 6031 * Use the slow path for cross-page handling. 6032 * Might trap for MMIO or watchpoints. 6033 */ 6034 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6035 6036 /* After any fault, zero the other elements. */ 6037 swap_memzero(vd, reg_off); 6038 reg_off += 1 << esz; 6039 mem_off += 1 << msz; 6040 swap_memzero(vd + reg_off, reg_max - reg_off); 6041 6042 if (is_split) { 6043 goto second_page; 6044 } 6045 } else { 6046 memset(vd, 0, reg_max); 6047 } 6048 } else { 6049 memset(vd, 0, reg_max); 6050 if (unlikely(mem_off == info.mem_off_split)) { 6051 /* The first active element crosses a page boundary. */ 6052 flags |= info.page[1].flags; 6053 if (unlikely(flags & TLB_MMIO)) { 6054 /* Some page is MMIO, see below. */ 6055 goto do_fault; 6056 } 6057 if (unlikely(flags & TLB_WATCHPOINT) && 6058 (cpu_watchpoint_address_matches 6059 (env_cpu(env), addr + mem_off, 1 << msz) 6060 & BP_MEM_READ)) { 6061 /* Watchpoint hit, see below. */ 6062 goto do_fault; 6063 } 6064 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6065 goto do_fault; 6066 } 6067 /* 6068 * Use the slow path for cross-page handling. 6069 * This is RAM, without a watchpoint, and will not trap. 6070 */ 6071 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6072 goto second_page; 6073 } 6074 } 6075 6076 /* 6077 * From this point on, all memory operations are MemSingleNF. 6078 * 6079 * Per the MemSingleNF pseudocode, a no-fault load from Device memory 6080 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead. 6081 * 6082 * Unfortuately we do not have access to the memory attributes from the 6083 * PTE to tell Device memory from Normal memory. So we make a mostly 6084 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO. 
6085 * This gives the right answer for the common cases of "Normal memory, 6086 * backed by host RAM" and "Device memory, backed by MMIO". 6087 * The architecture allows us to suppress an NF load and return 6088 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner 6089 * case of "Normal memory, backed by MMIO" is permitted. The case we 6090 * get wrong is "Device memory, backed by host RAM", for which we 6091 * should return (UNKNOWN, FAULT) for but do not. 6092 * 6093 * Similarly, CPU_BP breakpoints would raise exceptions, and so 6094 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and 6095 * architectural breakpoints the same. 6096 */ 6097 if (unlikely(flags & TLB_MMIO)) { 6098 goto do_fault; 6099 } 6100 6101 reg_last = info.reg_off_last[0]; 6102 host = info.page[0].host; 6103 6104 set_helper_retaddr(retaddr); 6105 6106 do { 6107 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); 6108 do { 6109 if ((pg >> (reg_off & 63)) & 1) { 6110 if (unlikely(flags & TLB_WATCHPOINT) && 6111 (cpu_watchpoint_address_matches 6112 (env_cpu(env), addr + mem_off, 1 << msz) 6113 & BP_MEM_READ)) { 6114 clear_helper_retaddr(); 6115 goto do_fault; 6116 } 6117 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6118 clear_helper_retaddr(); 6119 goto do_fault; 6120 } 6121 host_fn(vd, reg_off, host + mem_off); 6122 } 6123 reg_off += 1 << esz; 6124 mem_off += 1 << msz; 6125 } while (reg_off <= reg_last && (reg_off & 63)); 6126 } while (reg_off <= reg_last); 6127 6128 clear_helper_retaddr(); 6129 6130 /* 6131 * MemSingleNF is allowed to fail for any reason. We have special 6132 * code above to handle the first element crossing a page boundary. 6133 * As an implementation choice, decline to handle a cross-page element 6134 * in any other position. 6135 */ 6136 reg_off = info.reg_off_split; 6137 if (reg_off >= 0) { 6138 goto do_fault; 6139 } 6140 6141 second_page: 6142 reg_off = info.reg_off_first[1]; 6143 if (likely(reg_off < 0)) { 6144 /* No active elements on the second page. All done. */ 6145 return; 6146 } 6147 6148 /* 6149 * MemSingleNF is allowed to fail for any reason. As an implementation 6150 * choice, decline to handle elements on the second page. This should 6151 * be low frequency as the guest walks through memory -- the next 6152 * iteration of the guest's loop should be aligned on the page boundary, 6153 * and then all following iterations will stay aligned. 6154 */ 6155 6156 do_fault: 6157 record_fault(env, reg_off, reg_max); 6158 } 6159 6160 static inline QEMU_ALWAYS_INLINE 6161 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, 6162 uint32_t desc, const uintptr_t retaddr, 6163 const int esz, const int msz, const SVEContFault fault, 6164 sve_ldst1_host_fn *host_fn, 6165 sve_ldst1_tlb_fn *tlb_fn) 6166 { 6167 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6168 int bit55 = extract64(addr, 55, 1); 6169 6170 /* Remove mtedesc from the normal sve descriptor. */ 6171 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6172 6173 /* Perform gross MTE suppression early. 
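 * That is: if TBI is disabled for this half of the address space, or the
 * logical tag is one that TCMA exempts from checking, mtedesc is zeroed
 * so the per-element MTE checks below become no-ops.  For reference, the
 * logical tag examined here is just bits [59:56] of the pointer, roughly
 *
 *     tag = extract64(addr, 56, 4);
 *
 * (see allocation_tag_from_addr for the real definition).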
*/ 6174 if (!tbi_check(mtedesc, bit55) || 6175 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6176 mtedesc = 0; 6177 } 6178 6179 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, 6180 esz, msz, fault, host_fn, tlb_fn); 6181 } 6182 6183 #define DO_LDFF1_LDNF1_1(PART, ESZ) \ 6184 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ 6185 target_ulong addr, uint32_t desc) \ 6186 { \ 6187 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ 6188 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6189 } \ 6190 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ 6191 target_ulong addr, uint32_t desc) \ 6192 { \ 6193 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ 6194 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6195 } \ 6196 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6197 target_ulong addr, uint32_t desc) \ 6198 { \ 6199 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ 6200 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6201 } \ 6202 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6203 target_ulong addr, uint32_t desc) \ 6204 { \ 6205 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ 6206 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6207 } 6208 6209 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ 6210 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ 6211 target_ulong addr, uint32_t desc) \ 6212 { \ 6213 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6214 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6215 } \ 6216 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ 6217 target_ulong addr, uint32_t desc) \ 6218 { \ 6219 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6220 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6221 } \ 6222 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ 6223 target_ulong addr, uint32_t desc) \ 6224 { \ 6225 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6226 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6227 } \ 6228 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ 6229 target_ulong addr, uint32_t desc) \ 6230 { \ 6231 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6232 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6233 } \ 6234 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6235 target_ulong addr, uint32_t desc) \ 6236 { \ 6237 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6238 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6239 } \ 6240 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6241 target_ulong addr, uint32_t desc) \ 6242 { \ 6243 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6244 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6245 } \ 6246 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6247 target_ulong addr, uint32_t desc) \ 6248 { \ 6249 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6250 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6251 } \ 6252 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6253 target_ulong addr, uint32_t desc) \ 6254 { \ 6255 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6256 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6257 } 6258 6259 DO_LDFF1_LDNF1_1(bb, MO_8) 6260 DO_LDFF1_LDNF1_1(bhu, 
MO_16) 6261 DO_LDFF1_LDNF1_1(bhs, MO_16) 6262 DO_LDFF1_LDNF1_1(bsu, MO_32) 6263 DO_LDFF1_LDNF1_1(bss, MO_32) 6264 DO_LDFF1_LDNF1_1(bdu, MO_64) 6265 DO_LDFF1_LDNF1_1(bds, MO_64) 6266 6267 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) 6268 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) 6269 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) 6270 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) 6271 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) 6272 6273 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) 6274 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) 6275 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) 6276 6277 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) 6278 6279 #undef DO_LDFF1_LDNF1_1 6280 #undef DO_LDFF1_LDNF1_2 6281 6282 /* 6283 * Common helper for all contiguous 1,2,3,4-register predicated stores. 6284 */ 6285 6286 static inline QEMU_ALWAYS_INLINE 6287 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, 6288 uint32_t desc, const uintptr_t retaddr, 6289 const int esz, const int msz, const int N, uint32_t mtedesc, 6290 sve_ldst1_host_fn *host_fn, 6291 sve_ldst1_tlb_fn *tlb_fn) 6292 { 6293 const unsigned rd = simd_data(desc); 6294 const intptr_t reg_max = simd_oprsz(desc); 6295 intptr_t reg_off, reg_last, mem_off; 6296 SVEContLdSt info; 6297 void *host; 6298 int i, flags; 6299 6300 /* Find the active elements. */ 6301 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6302 /* The entire predicate was false; no store occurs. */ 6303 return; 6304 } 6305 6306 /* Probe the page(s). Exit with exception for any invalid page. */ 6307 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); 6308 6309 /* Handle watchpoints for all active elements. */ 6310 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6311 BP_MEM_WRITE, retaddr); 6312 6313 /* 6314 * Handle mte checks for all active elements. 6315 * Since TBI must be set for MTE, !mtedesc => !mte_active. 6316 */ 6317 if (mtedesc) { 6318 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6319 mtedesc, retaddr); 6320 } 6321 6322 flags = info.page[0].flags | info.page[1].flags; 6323 if (unlikely(flags != 0)) { 6324 /* 6325 * At least one page includes MMIO. 6326 * Any bus operation can fail with cpu_transaction_failed, 6327 * which for ARM will raise SyncExternal. We cannot avoid 6328 * this fault and will leave with the store incomplete. 
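 * (Contrast with sve_ldN_r above, which can buffer MMIO loads into
 * scratch registers and only commit once every element has been read.)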
6329 */ 6330 mem_off = info.mem_off_first[0]; 6331 reg_off = info.reg_off_first[0]; 6332 reg_last = info.reg_off_last[1]; 6333 if (reg_last < 0) { 6334 reg_last = info.reg_off_split; 6335 if (reg_last < 0) { 6336 reg_last = info.reg_off_last[0]; 6337 } 6338 } 6339 6340 do { 6341 uint64_t pg = vg[reg_off >> 6]; 6342 do { 6343 if ((pg >> (reg_off & 63)) & 1) { 6344 for (i = 0; i < N; ++i) { 6345 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6346 addr + mem_off + (i << msz), retaddr); 6347 } 6348 } 6349 reg_off += 1 << esz; 6350 mem_off += N << msz; 6351 } while (reg_off & 63); 6352 } while (reg_off <= reg_last); 6353 return; 6354 } 6355 6356 mem_off = info.mem_off_first[0]; 6357 reg_off = info.reg_off_first[0]; 6358 reg_last = info.reg_off_last[0]; 6359 host = info.page[0].host; 6360 6361 set_helper_retaddr(retaddr); 6362 6363 while (reg_off <= reg_last) { 6364 uint64_t pg = vg[reg_off >> 6]; 6365 do { 6366 if ((pg >> (reg_off & 63)) & 1) { 6367 for (i = 0; i < N; ++i) { 6368 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6369 host + mem_off + (i << msz)); 6370 } 6371 } 6372 reg_off += 1 << esz; 6373 mem_off += N << msz; 6374 } while (reg_off <= reg_last && (reg_off & 63)); 6375 } 6376 6377 clear_helper_retaddr(); 6378 6379 /* 6380 * Use the slow path to manage the cross-page misalignment. 6381 * But we know this is RAM and cannot trap. 6382 */ 6383 mem_off = info.mem_off_split; 6384 if (unlikely(mem_off >= 0)) { 6385 reg_off = info.reg_off_split; 6386 for (i = 0; i < N; ++i) { 6387 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6388 addr + mem_off + (i << msz), retaddr); 6389 } 6390 } 6391 6392 mem_off = info.mem_off_first[1]; 6393 if (unlikely(mem_off >= 0)) { 6394 reg_off = info.reg_off_first[1]; 6395 reg_last = info.reg_off_last[1]; 6396 host = info.page[1].host; 6397 6398 set_helper_retaddr(retaddr); 6399 6400 do { 6401 uint64_t pg = vg[reg_off >> 6]; 6402 do { 6403 if ((pg >> (reg_off & 63)) & 1) { 6404 for (i = 0; i < N; ++i) { 6405 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6406 host + mem_off + (i << msz)); 6407 } 6408 } 6409 reg_off += 1 << esz; 6410 mem_off += N << msz; 6411 } while (reg_off & 63); 6412 } while (reg_off <= reg_last); 6413 6414 clear_helper_retaddr(); 6415 } 6416 } 6417 6418 static inline QEMU_ALWAYS_INLINE 6419 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6420 uint32_t desc, const uintptr_t ra, 6421 const int esz, const int msz, const int N, 6422 sve_ldst1_host_fn *host_fn, 6423 sve_ldst1_tlb_fn *tlb_fn) 6424 { 6425 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6426 int bit55 = extract64(addr, 55, 1); 6427 6428 /* Remove mtedesc from the normal sve descriptor. */ 6429 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6430 6431 /* Perform gross MTE suppression early. 
*/ 6432 if (!tbi_check(mtedesc, bit55) || 6433 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6434 mtedesc = 0; 6435 } 6436 6437 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6438 } 6439 6440 #define DO_STN_1(N, NAME, ESZ) \ 6441 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ 6442 target_ulong addr, uint32_t desc) \ 6443 { \ 6444 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ 6445 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6446 } \ 6447 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6448 target_ulong addr, uint32_t desc) \ 6449 { \ 6450 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ 6451 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6452 } 6453 6454 #define DO_STN_2(N, NAME, ESZ, MSZ) \ 6455 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ 6456 target_ulong addr, uint32_t desc) \ 6457 { \ 6458 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6459 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6460 } \ 6461 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ 6462 target_ulong addr, uint32_t desc) \ 6463 { \ 6464 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6465 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6466 } \ 6467 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6468 target_ulong addr, uint32_t desc) \ 6469 { \ 6470 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6471 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6472 } \ 6473 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6474 target_ulong addr, uint32_t desc) \ 6475 { \ 6476 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6477 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6478 } 6479 6480 DO_STN_1(1, bb, MO_8) 6481 DO_STN_1(1, bh, MO_16) 6482 DO_STN_1(1, bs, MO_32) 6483 DO_STN_1(1, bd, MO_64) 6484 DO_STN_1(2, bb, MO_8) 6485 DO_STN_1(3, bb, MO_8) 6486 DO_STN_1(4, bb, MO_8) 6487 6488 DO_STN_2(1, hh, MO_16, MO_16) 6489 DO_STN_2(1, hs, MO_32, MO_16) 6490 DO_STN_2(1, hd, MO_64, MO_16) 6491 DO_STN_2(2, hh, MO_16, MO_16) 6492 DO_STN_2(3, hh, MO_16, MO_16) 6493 DO_STN_2(4, hh, MO_16, MO_16) 6494 6495 DO_STN_2(1, ss, MO_32, MO_32) 6496 DO_STN_2(1, sd, MO_64, MO_32) 6497 DO_STN_2(2, ss, MO_32, MO_32) 6498 DO_STN_2(3, ss, MO_32, MO_32) 6499 DO_STN_2(4, ss, MO_32, MO_32) 6500 6501 DO_STN_2(1, dd, MO_64, MO_64) 6502 DO_STN_2(2, dd, MO_64, MO_64) 6503 DO_STN_2(3, dd, MO_64, MO_64) 6504 DO_STN_2(4, dd, MO_64, MO_64) 6505 6506 #undef DO_STN_1 6507 #undef DO_STN_2 6508 6509 /* 6510 * Loads with a vector index. 6511 */ 6512 6513 /* 6514 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. 
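 *
 * These helpers feed the gather/scatter address computation below,
 * addr = base + (off_fn(vm, reg_off) << scale).  As a hedged example,
 * assuming the translator passes the scaled form's shift in simd_data:
 * LD1D { z0.d }, p0/z, [x1, z2.d, lsl #3] arrives with off_zd_d and
 * scale == 3, so active element e loads from x1 + (z2.d[e] << 3), while
 * the unscaled form arrives with scale == 0.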
6515 */ 6516 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); 6517 6518 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) 6519 { 6520 return *(uint32_t *)(reg + H1_4(reg_ofs)); 6521 } 6522 6523 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) 6524 { 6525 return *(int32_t *)(reg + H1_4(reg_ofs)); 6526 } 6527 6528 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) 6529 { 6530 return (uint32_t)*(uint64_t *)(reg + reg_ofs); 6531 } 6532 6533 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) 6534 { 6535 return (int32_t)*(uint64_t *)(reg + reg_ofs); 6536 } 6537 6538 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) 6539 { 6540 return *(uint64_t *)(reg + reg_ofs); 6541 } 6542 6543 static inline QEMU_ALWAYS_INLINE 6544 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6545 target_ulong base, uint32_t desc, uintptr_t retaddr, 6546 uint32_t mtedesc, int esize, int msize, 6547 zreg_off_fn *off_fn, 6548 sve_ldst1_host_fn *host_fn, 6549 sve_ldst1_tlb_fn *tlb_fn) 6550 { 6551 const int mmu_idx = arm_env_mmu_index(env); 6552 const intptr_t reg_max = simd_oprsz(desc); 6553 const int scale = simd_data(desc); 6554 ARMVectorReg scratch; 6555 intptr_t reg_off; 6556 SVEHostPage info, info2; 6557 6558 memset(&scratch, 0, reg_max); 6559 reg_off = 0; 6560 do { 6561 uint64_t pg = vg[reg_off >> 6]; 6562 do { 6563 if (likely(pg & 1)) { 6564 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6565 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6566 6567 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, 6568 mmu_idx, retaddr); 6569 6570 if (likely(in_page >= msize)) { 6571 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6572 cpu_check_watchpoint(env_cpu(env), addr, msize, 6573 info.attrs, BP_MEM_READ, retaddr); 6574 } 6575 if (mtedesc && info.tagged) { 6576 mte_check(env, mtedesc, addr, retaddr); 6577 } 6578 if (unlikely(info.flags & TLB_MMIO)) { 6579 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6580 } else { 6581 set_helper_retaddr(retaddr); 6582 host_fn(&scratch, reg_off, info.host); 6583 clear_helper_retaddr(); 6584 } 6585 } else { 6586 /* Element crosses the page boundary. */ 6587 sve_probe_page(&info2, false, env, addr + in_page, 0, 6588 MMU_DATA_LOAD, mmu_idx, retaddr); 6589 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { 6590 cpu_check_watchpoint(env_cpu(env), addr, 6591 msize, info.attrs, 6592 BP_MEM_READ, retaddr); 6593 } 6594 if (mtedesc && info.tagged) { 6595 mte_check(env, mtedesc, addr, retaddr); 6596 } 6597 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6598 } 6599 } 6600 reg_off += esize; 6601 pg >>= esize; 6602 } while (reg_off & 63); 6603 } while (reg_off < reg_max); 6604 6605 /* Wait until all exceptions have been raised to write back. */ 6606 memcpy(vd, &scratch, reg_max); 6607 } 6608 6609 static inline QEMU_ALWAYS_INLINE 6610 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6611 target_ulong base, uint32_t desc, uintptr_t retaddr, 6612 int esize, int msize, zreg_off_fn *off_fn, 6613 sve_ldst1_host_fn *host_fn, 6614 sve_ldst1_tlb_fn *tlb_fn) 6615 { 6616 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6617 /* Remove mtedesc from the normal sve descriptor. */ 6618 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6619 6620 /* 6621 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6622 * offset base entirely over the address space hole to change the 6623 * pointer tag, or change the bit55 selector. 
So we could here 6624 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 6625 */ 6626 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6627 esize, msize, off_fn, host_fn, tlb_fn); 6628 } 6629 6630 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ 6631 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6632 void *vm, target_ulong base, uint32_t desc) \ 6633 { \ 6634 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 6635 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6636 } \ 6637 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6638 void *vm, target_ulong base, uint32_t desc) \ 6639 { \ 6640 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 6641 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6642 } 6643 6644 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ 6645 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6646 void *vm, target_ulong base, uint32_t desc) \ 6647 { \ 6648 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 6649 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6650 } \ 6651 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6652 void *vm, target_ulong base, uint32_t desc) \ 6653 { \ 6654 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 6655 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6656 } 6657 6658 DO_LD1_ZPZ_S(bsu, zsu, MO_8) 6659 DO_LD1_ZPZ_S(bsu, zss, MO_8) 6660 DO_LD1_ZPZ_D(bdu, zsu, MO_8) 6661 DO_LD1_ZPZ_D(bdu, zss, MO_8) 6662 DO_LD1_ZPZ_D(bdu, zd, MO_8) 6663 6664 DO_LD1_ZPZ_S(bss, zsu, MO_8) 6665 DO_LD1_ZPZ_S(bss, zss, MO_8) 6666 DO_LD1_ZPZ_D(bds, zsu, MO_8) 6667 DO_LD1_ZPZ_D(bds, zss, MO_8) 6668 DO_LD1_ZPZ_D(bds, zd, MO_8) 6669 6670 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) 6671 DO_LD1_ZPZ_S(hsu_le, zss, MO_16) 6672 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) 6673 DO_LD1_ZPZ_D(hdu_le, zss, MO_16) 6674 DO_LD1_ZPZ_D(hdu_le, zd, MO_16) 6675 6676 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) 6677 DO_LD1_ZPZ_S(hsu_be, zss, MO_16) 6678 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) 6679 DO_LD1_ZPZ_D(hdu_be, zss, MO_16) 6680 DO_LD1_ZPZ_D(hdu_be, zd, MO_16) 6681 6682 DO_LD1_ZPZ_S(hss_le, zsu, MO_16) 6683 DO_LD1_ZPZ_S(hss_le, zss, MO_16) 6684 DO_LD1_ZPZ_D(hds_le, zsu, MO_16) 6685 DO_LD1_ZPZ_D(hds_le, zss, MO_16) 6686 DO_LD1_ZPZ_D(hds_le, zd, MO_16) 6687 6688 DO_LD1_ZPZ_S(hss_be, zsu, MO_16) 6689 DO_LD1_ZPZ_S(hss_be, zss, MO_16) 6690 DO_LD1_ZPZ_D(hds_be, zsu, MO_16) 6691 DO_LD1_ZPZ_D(hds_be, zss, MO_16) 6692 DO_LD1_ZPZ_D(hds_be, zd, MO_16) 6693 6694 DO_LD1_ZPZ_S(ss_le, zsu, MO_32) 6695 DO_LD1_ZPZ_S(ss_le, zss, MO_32) 6696 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) 6697 DO_LD1_ZPZ_D(sdu_le, zss, MO_32) 6698 DO_LD1_ZPZ_D(sdu_le, zd, MO_32) 6699 6700 DO_LD1_ZPZ_S(ss_be, zsu, MO_32) 6701 DO_LD1_ZPZ_S(ss_be, zss, MO_32) 6702 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) 6703 DO_LD1_ZPZ_D(sdu_be, zss, MO_32) 6704 DO_LD1_ZPZ_D(sdu_be, zd, MO_32) 6705 6706 DO_LD1_ZPZ_D(sds_le, zsu, MO_32) 6707 DO_LD1_ZPZ_D(sds_le, zss, MO_32) 6708 DO_LD1_ZPZ_D(sds_le, zd, MO_32) 6709 6710 DO_LD1_ZPZ_D(sds_be, zsu, MO_32) 6711 DO_LD1_ZPZ_D(sds_be, zss, MO_32) 6712 DO_LD1_ZPZ_D(sds_be, zd, MO_32) 6713 6714 DO_LD1_ZPZ_D(dd_le, zsu, MO_64) 6715 DO_LD1_ZPZ_D(dd_le, zss, MO_64) 6716 DO_LD1_ZPZ_D(dd_le, zd, MO_64) 6717 6718 DO_LD1_ZPZ_D(dd_be, zsu, MO_64) 6719 DO_LD1_ZPZ_D(dd_be, zss, MO_64) 6720 DO_LD1_ZPZ_D(dd_be, zd, MO_64) 6721 6722 #undef DO_LD1_ZPZ_S 6723 #undef DO_LD1_ZPZ_D 6724 6725 /* First fault loads with a vector index. 
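 *
 * Only the first active element of a gather first-fault load may take a
 * real fault (it goes through tlb_fn with the true return address).
 * Every later element is probed with nofault semantics, and on any
 * problem -- invalid page, MMIO, a read watchpoint, a failed MTE probe,
 * or an element that crosses a page boundary -- the helper truncates FFR
 * at that element via record_fault() and stops.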
*/ 6726 6727 /* 6728 * Common helpers for all gather first-faulting loads. 6729 */ 6730 6731 static inline QEMU_ALWAYS_INLINE 6732 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6733 target_ulong base, uint32_t desc, uintptr_t retaddr, 6734 uint32_t mtedesc, const int esz, const int msz, 6735 zreg_off_fn *off_fn, 6736 sve_ldst1_host_fn *host_fn, 6737 sve_ldst1_tlb_fn *tlb_fn) 6738 { 6739 const int mmu_idx = arm_env_mmu_index(env); 6740 const intptr_t reg_max = simd_oprsz(desc); 6741 const int scale = simd_data(desc); 6742 const int esize = 1 << esz; 6743 const int msize = 1 << msz; 6744 intptr_t reg_off; 6745 SVEHostPage info; 6746 target_ulong addr, in_page; 6747 ARMVectorReg scratch; 6748 6749 /* Skip to the first true predicate. */ 6750 reg_off = find_next_active(vg, 0, reg_max, esz); 6751 if (unlikely(reg_off >= reg_max)) { 6752 /* The entire predicate was false; no load occurs. */ 6753 memset(vd, 0, reg_max); 6754 return; 6755 } 6756 6757 /* Protect against overlap between vd and vm. */ 6758 if (unlikely(vd == vm)) { 6759 vm = memcpy(&scratch, vm, reg_max); 6760 } 6761 6762 /* 6763 * Probe the first element, allowing faults. 6764 */ 6765 addr = base + (off_fn(vm, reg_off) << scale); 6766 if (mtedesc) { 6767 mte_check(env, mtedesc, addr, retaddr); 6768 } 6769 tlb_fn(env, vd, reg_off, addr, retaddr); 6770 6771 /* After any fault, zero the other elements. */ 6772 swap_memzero(vd, reg_off); 6773 reg_off += esize; 6774 swap_memzero(vd + reg_off, reg_max - reg_off); 6775 6776 /* 6777 * Probe the remaining elements, not allowing faults. 6778 */ 6779 while (reg_off < reg_max) { 6780 uint64_t pg = vg[reg_off >> 6]; 6781 do { 6782 if (likely((pg >> (reg_off & 63)) & 1)) { 6783 addr = base + (off_fn(vm, reg_off) << scale); 6784 in_page = -(addr | TARGET_PAGE_MASK); 6785 6786 if (unlikely(in_page < msize)) { 6787 /* Stop if the element crosses a page boundary. */ 6788 goto fault; 6789 } 6790 6791 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, 6792 mmu_idx, retaddr); 6793 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { 6794 goto fault; 6795 } 6796 if (unlikely(info.flags & TLB_WATCHPOINT) && 6797 (cpu_watchpoint_address_matches 6798 (env_cpu(env), addr, msize) & BP_MEM_READ)) { 6799 goto fault; 6800 } 6801 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { 6802 goto fault; 6803 } 6804 6805 set_helper_retaddr(retaddr); 6806 host_fn(vd, reg_off, info.host); 6807 clear_helper_retaddr(); 6808 } 6809 reg_off += esize; 6810 } while (reg_off & 63); 6811 } 6812 return; 6813 6814 fault: 6815 record_fault(env, reg_off, reg_max); 6816 } 6817 6818 static inline QEMU_ALWAYS_INLINE 6819 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6820 target_ulong base, uint32_t desc, uintptr_t retaddr, 6821 const int esz, const int msz, 6822 zreg_off_fn *off_fn, 6823 sve_ldst1_host_fn *host_fn, 6824 sve_ldst1_tlb_fn *tlb_fn) 6825 { 6826 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6827 /* Remove mtedesc from the normal sve descriptor. */ 6828 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6829 6830 /* 6831 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6832 * offset base entirely over the address space hole to change the 6833 * pointer tag, or change the bit55 selector. So we could here 6834 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
6835 */ 6836 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6837 esz, msz, off_fn, host_fn, tlb_fn); 6838 } 6839 6840 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ 6841 void HELPER(sve_ldff##MEM##_##OFS) \ 6842 (CPUARMState *env, void *vd, void *vg, \ 6843 void *vm, target_ulong base, uint32_t desc) \ 6844 { \ 6845 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ 6846 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6847 } \ 6848 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 6849 (CPUARMState *env, void *vd, void *vg, \ 6850 void *vm, target_ulong base, uint32_t desc) \ 6851 { \ 6852 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ 6853 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6854 } 6855 6856 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ 6857 void HELPER(sve_ldff##MEM##_##OFS) \ 6858 (CPUARMState *env, void *vd, void *vg, \ 6859 void *vm, target_ulong base, uint32_t desc) \ 6860 { \ 6861 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ 6862 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6863 } \ 6864 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 6865 (CPUARMState *env, void *vd, void *vg, \ 6866 void *vm, target_ulong base, uint32_t desc) \ 6867 { \ 6868 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ 6869 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6870 } 6871 6872 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) 6873 DO_LDFF1_ZPZ_S(bsu, zss, MO_8) 6874 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) 6875 DO_LDFF1_ZPZ_D(bdu, zss, MO_8) 6876 DO_LDFF1_ZPZ_D(bdu, zd, MO_8) 6877 6878 DO_LDFF1_ZPZ_S(bss, zsu, MO_8) 6879 DO_LDFF1_ZPZ_S(bss, zss, MO_8) 6880 DO_LDFF1_ZPZ_D(bds, zsu, MO_8) 6881 DO_LDFF1_ZPZ_D(bds, zss, MO_8) 6882 DO_LDFF1_ZPZ_D(bds, zd, MO_8) 6883 6884 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) 6885 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) 6886 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) 6887 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) 6888 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) 6889 6890 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) 6891 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) 6892 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) 6893 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) 6894 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) 6895 6896 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16) 6897 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) 6898 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) 6899 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) 6900 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) 6901 6902 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) 6903 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) 6904 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) 6905 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) 6906 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) 6907 6908 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) 6909 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) 6910 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) 6911 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) 6912 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) 6913 6914 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) 6915 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) 6916 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) 6917 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) 6918 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) 6919 6920 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) 6921 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) 6922 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) 6923 6924 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) 6925 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) 6926 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) 6927 6928 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) 6929 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) 6930 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) 6931 6932 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) 6933 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) 6934 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) 6935 6936 /* Stores with a vector index. 
*/ 6937 6938 static inline QEMU_ALWAYS_INLINE 6939 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6940 target_ulong base, uint32_t desc, uintptr_t retaddr, 6941 uint32_t mtedesc, int esize, int msize, 6942 zreg_off_fn *off_fn, 6943 sve_ldst1_host_fn *host_fn, 6944 sve_ldst1_tlb_fn *tlb_fn) 6945 { 6946 const int mmu_idx = arm_env_mmu_index(env); 6947 const intptr_t reg_max = simd_oprsz(desc); 6948 const int scale = simd_data(desc); 6949 void *host[ARM_MAX_VQ * 4]; 6950 intptr_t reg_off, i; 6951 SVEHostPage info, info2; 6952 6953 /* 6954 * Probe all of the elements for host addresses and flags. 6955 */ 6956 i = reg_off = 0; 6957 do { 6958 uint64_t pg = vg[reg_off >> 6]; 6959 do { 6960 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6961 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6962 6963 host[i] = NULL; 6964 if (likely((pg >> (reg_off & 63)) & 1)) { 6965 if (likely(in_page >= msize)) { 6966 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE, 6967 mmu_idx, retaddr); 6968 if (!(info.flags & TLB_MMIO)) { 6969 host[i] = info.host; 6970 } 6971 } else { 6972 /* 6973 * Element crosses the page boundary. 6974 * Probe both pages, but do not record the host address, 6975 * so that we use the slow path. 6976 */ 6977 sve_probe_page(&info, false, env, addr, 0, 6978 MMU_DATA_STORE, mmu_idx, retaddr); 6979 sve_probe_page(&info2, false, env, addr + in_page, 0, 6980 MMU_DATA_STORE, mmu_idx, retaddr); 6981 info.flags |= info2.flags; 6982 } 6983 6984 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6985 cpu_check_watchpoint(env_cpu(env), addr, msize, 6986 info.attrs, BP_MEM_WRITE, retaddr); 6987 } 6988 6989 if (mtedesc && info.tagged) { 6990 mte_check(env, mtedesc, addr, retaddr); 6991 } 6992 } 6993 i += 1; 6994 reg_off += esize; 6995 } while (reg_off & 63); 6996 } while (reg_off < reg_max); 6997 6998 /* 6999 * Now that we have recognized all exceptions except SyncExternal 7000 * (from TLB_MMIO), which we cannot avoid, perform all of the stores. 7001 * 7002 * Note for the common case of an element in RAM, not crossing a page 7003 * boundary, we have stored the host address in host[]. This doubles 7004 * as a first-level check against the predicate, since only enabled 7005 * elements have non-null host addresses. 7006 */ 7007 i = reg_off = 0; 7008 do { 7009 void *h = host[i]; 7010 if (likely(h != NULL)) { 7011 set_helper_retaddr(retaddr); 7012 host_fn(vd, reg_off, h); 7013 clear_helper_retaddr(); 7014 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) { 7015 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7016 tlb_fn(env, vd, reg_off, addr, retaddr); 7017 } 7018 i += 1; 7019 reg_off += esize; 7020 } while (reg_off < reg_max); 7021 } 7022 7023 static inline QEMU_ALWAYS_INLINE 7024 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7025 target_ulong base, uint32_t desc, uintptr_t retaddr, 7026 int esize, int msize, zreg_off_fn *off_fn, 7027 sve_ldst1_host_fn *host_fn, 7028 sve_ldst1_tlb_fn *tlb_fn) 7029 { 7030 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7031 /* Remove mtedesc from the normal sve descriptor. */ 7032 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7033 7034 /* 7035 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7036 * offset base entirely over the address space hole to change the 7037 * pointer tag, or change the bit55 selector. So we could here 7038 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
static inline QEMU_ALWAYS_INLINE
void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}

#define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

#define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D
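
/*
 * SVE2 bitwise ternary operations (EOR3, BCAX, BSL1N, BSL2N, NBSL).
 * These are unpredicated and operate on the full vector, 64 bits at a time.
 */
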
void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = n[i] ^ m[i] ^ k[i];
    }
}

void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = n[i] ^ (m[i] & ~k[i]);
    }
}

void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
    }
}

void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
    }
}

void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
    }
}

/*
 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
 * See hasless(v,1) from
 *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;
    uint64_t ones = dup_const(esz, 1);
    uint64_t signs = ones << (bits - 1);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(esz, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;
    cmp0 = (cmp0 - ones) & ~cmp0;
    cmp1 = (cmp1 - ones) & ~cmp1;
    return (cmp0 | cmp1) & signs;
}
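
/*
 * Worked example of the zero-detection trick above, for one 8-bit lane x
 * of cmp = n ^ m: (x - 1) & ~x has its sign bit set iff x == 0, i.e. iff
 * that lane of m equals n.  E.g. x == 0x00 gives 0xff & 0xff = 0xff (sign
 * set); x == 0x01 gives 0x00 & 0xfe = 0x00; x == 0x80 gives 0x7f & 0x7f
 * = 0x7f (sign clear).  A borrow out of a zero lane can leak into the
 * lane above, but that only happens once a match already exists, so the
 * any-lane boolean returned here is still exact.
 */
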
static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
                                uint32_t desc, int esz, bool nmatch)
{
    uint16_t esz_mask = pred_esz_masks[esz];
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}

#define DO_PPZZ_MATCH(NAME, ESZ, INV)                                        \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                            \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV);                         \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH

void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}

void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t count = 0;
        if (pg[H1(i)] & 1) {
            uint64_t nn = n[i];
            for (j = 0; j <= i; ++j) {
                if ((pg[H1(j)] & 1) && nn == m[j]) {
                    ++count;
                }
            }
        }
        d[i] = count;
    }
}

/*
 * Returns the number of bytes in m0 and m1 that match n.
 * Unlike do_match2 we don't just need true/false, we need an exact count.
 * This requires two extra logical operations.
 */
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /*
     * 1: clear msb of each byte to avoid carry to next byte (& mask)
     * 2: carry in to msb if byte != 0 (+ mask)
     * 3: set msb if cmp has msb set (| cmp)
     * 4: set ~msb to ignore them (| mask)
     * We now have 0xff for byte != 0 or 0x7f for byte == 0.
     * 5: invert, resulting in 0x80 if and only if byte == 0.
     */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /*
     * Combine the two compares in a way that the bits do
     * not overlap, and so preserves the count of set bits.
     * If the host has an efficient instruction for ctpop,
     * then ctpop(x) + ctpop(y) has the same number of
     * operations as ctpop(x | (y >> 1)).  If the host does
     * not have an efficient ctpop, then we only want to
     * use it once.
     */
    return ctpop64(cmp0 | (cmp1 >> 1));
}
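
/*
 * Checking the steps above on a single byte c of cmp: for c == 0x00,
 * ((0x00 & 0x7f) + 0x7f) | 0x00 | 0x7f = 0x7f, inverted 0x80; for
 * c == 0x01, (0x01 + 0x7f) | 0x01 | 0x7f = 0xff, inverted 0x00; for
 * c == 0x80, (0x00 + 0x7f) | 0x80 | 0x7f = 0xff, inverted 0x00.
 * Masking with 0x7f before the add keeps each per-byte sum below 0x100,
 * so no carry spills into the neighbouring byte and the per-byte result
 * is exact, which is what the popcount above relies on.
 */
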
void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t n0 = *(uint64_t *)(vn + i);
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t n1 = *(uint64_t *)(vn + i + 8);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint64_t out0 = 0;
        uint64_t out1 = 0;

        for (j = 0; j < 64; j += 8) {
            uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
            uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
            out0 |= cnt0 << j;
            out1 |= cnt1 << j;
        }

        *(uint64_t *)(vd + i) = out0;
        *(uint64_t *)(vd + i + 8) = out1;
    }
}
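
/*
 * XAR: rotate each element of N ^ M right by the immediate.  For the
 * 8- and 16-bit element sizes the rotate is done on all lanes of a
 * 64-bit word at once: (t >> shr) & mask keeps the bits that stay
 * within their own lane, while (t << shl) & ~mask brings the low shr
 * bits of each lane around to its top; bits that would cross a lane
 * boundary are masked off by the two masks.
 */
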
void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 8 - shr;
    uint64_t mask = dup_const(MO_8, 0xff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 16 - shr;
    uint64_t mask = dup_const(MO_16, 0xffff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    int shr = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror32(n[i] ^ m[i], shr);
    }
}

void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
                     float_status *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float32 *n = vn + s * sizeof(float32) * 4;
        float32 *m = vm + s * sizeof(float32) * 4;
        float32 *a = va + s * sizeof(float32) * 4;
        float32 *d = vd + s * sizeof(float32) * 4;
        float32 n00 = n[H4(0)], n01 = n[H4(1)];
        float32 n10 = n[H4(2)], n11 = n[H4(3)];
        float32 m00 = m[H4(0)], m01 = m[H4(1)];
        float32 m10 = m[H4(2)], m11 = m[H4(3)];
        float32 p0, p1;

        /* i = 0, j = 0 */
        p0 = float32_mul(n00, m00, status);
        p1 = float32_mul(n01, m01, status);
        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float32_mul(n00, m10, status);
        p1 = float32_mul(n01, m11, status);
        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float32_mul(n10, m00, status);
        p1 = float32_mul(n11, m01, status);
        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float32_mul(n10, m10, status);
        p1 = float32_mul(n11, m11, status);
        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
    }
}

void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
                     float_status *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float64 *n = vn + s * sizeof(float64) * 4;
        float64 *m = vm + s * sizeof(float64) * 4;
        float64 *a = va + s * sizeof(float64) * 4;
        float64 *d = vd + s * sizeof(float64) * 4;
        float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
        float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
        float64 p0, p1;

        /* i = 0, j = 0 */
        p0 = float64_mul(n00, m00, status);
        p1 = float64_mul(n01, m01, status);
        d[0] = float64_add(a[0], float64_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float64_mul(n00, m10, status);
        p1 = float64_mul(n01, m11, status);
        d[1] = float64_add(a[1], float64_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float64_mul(n10, m00, status);
        p1 = float64_mul(n11, m01, status);
        d[2] = float64_add(a[2], float64_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float64_mul(n10, m10, status);
        p1 = float64_mul(n11, m11, status);
        d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
    }
}

#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
                  float_status *status, uint32_t desc)                        \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)

#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
                  float_status *status, uint32_t desc)                        \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPEW);                                               \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
                *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}

DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)

#undef DO_FCVTLT
#undef DO_FCVTNT