1 /* 2 * ARM SVE Operations 3 * 4 * Copyright (c) 2018 Linaro, Ltd. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "cpu.h" 22 #include "internals.h" 23 #include "exec/page-protection.h" 24 #include "exec/helper-proto.h" 25 #include "exec/target_page.h" 26 #include "exec/tlb-flags.h" 27 #include "tcg/tcg-gvec-desc.h" 28 #include "fpu/softfloat.h" 29 #include "tcg/tcg.h" 30 #include "vec_internal.h" 31 #include "sve_ldst_internal.h" 32 #include "accel/tcg/cpu-ldst.h" 33 #include "accel/tcg/helper-retaddr.h" 34 #include "accel/tcg/cpu-ops.h" 35 #include "accel/tcg/probe.h" 36 #ifdef CONFIG_USER_ONLY 37 #include "user/page-protection.h" 38 #endif 39 40 41 /* Return a value for NZCV as per the ARM PredTest pseudofunction. 42 * 43 * The return value has bit 31 set if N is set, bit 1 set if Z is clear, 44 * and bit 0 set if C is set. Compare the definitions of these variables 45 * within CPUARMState. 46 */ 47 48 /* For no G bits set, NZCV = C. */ 49 #define PREDTEST_INIT 1 50 51 /* This is an iterative function, called for each Pd and Pg word 52 * moving forward. 53 */ 54 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags) 55 { 56 if (likely(g)) { 57 /* Compute N from first D & G. 58 Use bit 2 to signal first G bit seen. */ 59 if (!(flags & 4)) { 60 flags |= ((d & (g & -g)) != 0) << 31; 61 flags |= 4; 62 } 63 64 /* Accumulate Z from each D & G. */ 65 flags |= ((d & g) != 0) << 1; 66 67 /* Compute C from last !(D & G). Replace previous. */ 68 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0); 69 } 70 return flags; 71 } 72 73 /* This is an iterative function, called for each Pd and Pg word 74 * moving backward. 75 */ 76 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags) 77 { 78 if (likely(g)) { 79 /* Compute C from first (i.e last) !(D & G). 80 Use bit 2 to signal first G bit seen. */ 81 if (!(flags & 4)) { 82 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */ 83 flags |= (d & pow2floor(g)) == 0; 84 } 85 86 /* Accumulate Z from each D & G. */ 87 flags |= ((d & g) != 0) << 1; 88 89 /* Compute N from last (i.e first) D & G. Replace previous. */ 90 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0); 91 } 92 return flags; 93 } 94 95 /* The same for a single word predicate. */ 96 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g) 97 { 98 return iter_predtest_fwd(d, g, PREDTEST_INIT); 99 } 100 101 /* The same for a multi-word predicate. */ 102 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words) 103 { 104 uint32_t flags = PREDTEST_INIT; 105 uint64_t *d = vd, *g = vg; 106 uintptr_t i = 0; 107 108 do { 109 flags = iter_predtest_fwd(d[i], g[i], flags); 110 } while (++i < words); 111 112 return flags; 113 } 114 115 /* Similarly for single word elements. 
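 *
 * Only bits 0 and 4 of the predicate byte govern the two 32-bit elements
 * of an 8-byte chunk, hence the table indexed by (byte & 0x11).  As an
 * illustrative sketch (pg_byte, n64 and d64 are hypothetical names), the
 * expanded value can serve as a lane mask for merging predication:
 *
 *     uint64_t mask = expand_pred_s(pg_byte);
 *     d64 = (n64 & mask) | (d64 & ~mask);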
*/ 116 static inline uint64_t expand_pred_s(uint8_t byte) 117 { 118 static const uint64_t word[] = { 119 [0x01] = 0x00000000ffffffffull, 120 [0x10] = 0xffffffff00000000ull, 121 [0x11] = 0xffffffffffffffffull, 122 }; 123 return word[byte & 0x11]; 124 } 125 126 static inline uint64_t expand_pred_d(uint8_t byte) 127 { 128 return -(uint64_t)(byte & 1); 129 } 130 131 #define LOGICAL_PPPP(NAME, FUNC) \ 132 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 133 { \ 134 uintptr_t opr_sz = simd_oprsz(desc); \ 135 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \ 136 uintptr_t i; \ 137 for (i = 0; i < opr_sz / 8; ++i) { \ 138 d[i] = FUNC(n[i], m[i], g[i]); \ 139 } \ 140 } 141 142 #define DO_AND(N, M, G) (((N) & (M)) & (G)) 143 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G)) 144 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G)) 145 #define DO_ORR(N, M, G) (((N) | (M)) & (G)) 146 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G)) 147 #define DO_NOR(N, M, G) (~((N) | (M)) & (G)) 148 #define DO_NAND(N, M, G) (~((N) & (M)) & (G)) 149 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G))) 150 151 LOGICAL_PPPP(sve_and_pppp, DO_AND) 152 LOGICAL_PPPP(sve_bic_pppp, DO_BIC) 153 LOGICAL_PPPP(sve_eor_pppp, DO_EOR) 154 LOGICAL_PPPP(sve_sel_pppp, DO_SEL) 155 LOGICAL_PPPP(sve_orr_pppp, DO_ORR) 156 LOGICAL_PPPP(sve_orn_pppp, DO_ORN) 157 LOGICAL_PPPP(sve_nor_pppp, DO_NOR) 158 LOGICAL_PPPP(sve_nand_pppp, DO_NAND) 159 160 #undef DO_AND 161 #undef DO_BIC 162 #undef DO_EOR 163 #undef DO_ORR 164 #undef DO_ORN 165 #undef DO_NOR 166 #undef DO_NAND 167 #undef DO_SEL 168 #undef LOGICAL_PPPP 169 170 /* Fully general three-operand expander, controlled by a predicate. 171 * This is complicated by the host-endian storage of the register file. 172 */ 173 /* ??? I don't expect the compiler could ever vectorize this itself. 174 * With some tables we can convert bit masks to byte masks, and with 175 * extra care wrt byte/word ordering we could use gcc generic vectors 176 * and do 16 bytes at a time. 177 */ 178 #define DO_ZPZZ(NAME, TYPE, H, OP) \ 179 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 180 { \ 181 intptr_t i, opr_sz = simd_oprsz(desc); \ 182 for (i = 0; i < opr_sz; ) { \ 183 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 184 do { \ 185 if (pg & 1) { \ 186 TYPE nn = *(TYPE *)(vn + H(i)); \ 187 TYPE mm = *(TYPE *)(vm + H(i)); \ 188 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 189 } \ 190 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 191 } while (i & 15); \ 192 } \ 193 } 194 195 /* Similarly, specialized for 64-bit operands. */ 196 #define DO_ZPZZ_D(NAME, TYPE, OP) \ 197 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 198 { \ 199 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 200 TYPE *d = vd, *n = vn, *m = vm; \ 201 uint8_t *pg = vg; \ 202 for (i = 0; i < opr_sz; i += 1) { \ 203 if (pg[H1(i)] & 1) { \ 204 TYPE nn = n[i], mm = m[i]; \ 205 d[i] = OP(nn, mm); \ 206 } \ 207 } \ 208 } 209 210 #define DO_AND(N, M) (N & M) 211 #define DO_EOR(N, M) (N ^ M) 212 #define DO_ORR(N, M) (N | M) 213 #define DO_BIC(N, M) (N & ~M) 214 #define DO_ORC(N, M) (N | ~M) 215 #define DO_ADD(N, M) (N + M) 216 #define DO_SUB(N, M) (N - M) 217 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) 218 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) 219 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N)) 220 #define DO_MUL(N, M) (N * M) 221 222 223 /* 224 * We must avoid the C undefined behaviour cases: division by 225 * zero and signed division of INT_MIN by -1. 
Both of these 226 * have architecturally defined required results for Arm. 227 * We special case all signed divisions by -1 to avoid having 228 * to deduce the minimum integer for the type involved. 229 */ 230 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M) 231 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M) 232 233 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND) 234 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND) 235 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND) 236 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND) 237 238 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR) 239 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR) 240 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR) 241 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR) 242 243 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR) 244 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR) 245 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR) 246 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR) 247 248 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC) 249 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC) 250 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC) 251 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC) 252 253 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) 254 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD) 255 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD) 256 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD) 257 258 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB) 259 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB) 260 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB) 261 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB) 262 263 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX) 264 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX) 265 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX) 266 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX) 267 268 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX) 269 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX) 270 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX) 271 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX) 272 273 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN) 274 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN) 275 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN) 276 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN) 277 278 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN) 279 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN) 280 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN) 281 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN) 282 283 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD) 284 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD) 285 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD) 286 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD) 287 288 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD) 289 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD) 290 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD) 291 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD) 292 293 /* Because the computation type is at least twice as large as required, 294 these work for both signed and unsigned source types. 
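   For example, with byte inputs 0xff and 0x03: the unsigned helpers see
   255 * 3 = 765 = 0x2fd, and 765 >> 8 = 0x02 is the unsigned high half,
   while the signed helpers see -1 * 3 = -3, and -3 >> 8 = -1 truncates to
   0xff, the signed high half.  The same do_mulh_* routines serve both
   because the DO_ZPZZ callers sign- or zero-extend the inputs beforehand.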
*/ 295 static inline uint8_t do_mulh_b(int32_t n, int32_t m) 296 { 297 return (n * m) >> 8; 298 } 299 300 static inline uint16_t do_mulh_h(int32_t n, int32_t m) 301 { 302 return (n * m) >> 16; 303 } 304 305 static inline uint32_t do_mulh_s(int64_t n, int64_t m) 306 { 307 return (n * m) >> 32; 308 } 309 310 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m) 311 { 312 uint64_t lo, hi; 313 muls64(&lo, &hi, n, m); 314 return hi; 315 } 316 317 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m) 318 { 319 uint64_t lo, hi; 320 mulu64(&lo, &hi, n, m); 321 return hi; 322 } 323 324 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL) 325 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL) 326 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL) 327 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL) 328 329 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b) 330 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h) 331 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s) 332 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d) 333 334 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b) 335 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h) 336 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s) 337 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d) 338 339 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV) 340 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV) 341 342 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV) 343 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV) 344 345 /* Note that all bits of the shift are significant 346 and not modulo the element size. */ 347 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1)) 348 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0) 349 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0) 350 351 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR) 352 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR) 353 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL) 354 355 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR) 356 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR) 357 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL) 358 359 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR) 360 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR) 361 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL) 362 363 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR) 364 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR) 365 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL) 366 367 static inline uint16_t do_sadalp_h(int16_t n, int16_t m) 368 { 369 int8_t n1 = n, n2 = n >> 8; 370 return m + n1 + n2; 371 } 372 373 static inline uint32_t do_sadalp_s(int32_t n, int32_t m) 374 { 375 int16_t n1 = n, n2 = n >> 16; 376 return m + n1 + n2; 377 } 378 379 static inline uint64_t do_sadalp_d(int64_t n, int64_t m) 380 { 381 int32_t n1 = n, n2 = n >> 32; 382 return m + n1 + n2; 383 } 384 385 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h) 386 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s) 387 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d) 388 389 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m) 390 { 391 uint8_t n1 = n, n2 = n >> 8; 392 return m + n1 + n2; 393 } 394 395 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m) 396 { 397 uint16_t n1 = n, n2 = n >> 16; 398 return m + n1 + n2; 399 } 400 401 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m) 402 { 403 uint32_t n1 = n, n2 = n >> 32; 404 return m + n1 + n2; 405 } 406 407 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h) 408 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s) 409 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d) 410 411 #define do_srshl_b(n, m) 
do_sqrshl_bhs(n, m, 8, true, NULL) 412 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL) 413 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL) 414 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL) 415 416 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b) 417 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h) 418 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s) 419 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d) 420 421 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL) 422 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL) 423 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL) 424 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL) 425 426 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b) 427 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h) 428 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s) 429 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d) 430 431 /* 432 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set. 433 * We pass in a pointer to a dummy saturation field to trigger 434 * the saturating arithmetic but discard the information about 435 * whether it has occurred. 436 */ 437 #define do_sqshl_b(n, m) \ 438 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); }) 439 #define do_sqshl_h(n, m) \ 440 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); }) 441 #define do_sqshl_s(n, m) \ 442 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); }) 443 #define do_sqshl_d(n, m) \ 444 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); }) 445 446 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b) 447 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h) 448 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s) 449 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d) 450 451 #define do_uqshl_b(n, m) \ 452 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 453 #define do_uqshl_h(n, m) \ 454 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 455 #define do_uqshl_s(n, m) \ 456 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); }) 457 #define do_uqshl_d(n, m) \ 458 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); }) 459 460 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b) 461 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h) 462 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s) 463 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d) 464 465 #define do_sqrshl_b(n, m) \ 466 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); }) 467 #define do_sqrshl_h(n, m) \ 468 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); }) 469 #define do_sqrshl_s(n, m) \ 470 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); }) 471 #define do_sqrshl_d(n, m) \ 472 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); }) 473 474 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b) 475 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h) 476 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s) 477 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d) 478 479 #undef do_sqrshl_d 480 481 #define do_uqrshl_b(n, m) \ 482 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); }) 483 #define do_uqrshl_h(n, m) \ 484 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); }) 485 #define do_uqrshl_s(n, m) \ 486 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); }) 487 #define do_uqrshl_d(n, m) \ 488 ({ uint32_t discard; do_uqrshl_d(n, 
m, true, &discard); }) 489 490 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b) 491 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h) 492 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s) 493 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d) 494 495 #undef do_uqrshl_d 496 497 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1) 498 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1)) 499 500 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS) 501 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS) 502 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS) 503 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D) 504 505 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS) 506 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS) 507 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS) 508 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D) 509 510 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1) 511 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1)) 512 513 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS) 514 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS) 515 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS) 516 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D) 517 518 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS) 519 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS) 520 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS) 521 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D) 522 523 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1) 524 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1)) 525 526 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS) 527 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS) 528 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS) 529 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D) 530 531 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS) 532 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) 533 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) 534 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) 535 536 #define DO_SQADD_B(n, m) do_ssat_b((int64_t)n + m) 537 #define DO_SQADD_H(n, m) do_ssat_h((int64_t)n + m) 538 #define DO_SQADD_S(n, m) do_ssat_s((int64_t)n + m) 539 540 static inline int64_t do_sqadd_d(int64_t n, int64_t m) 541 { 542 int64_t r = n + m; 543 if (((r ^ n) & ~(n ^ m)) < 0) { 544 /* Signed overflow. */ 545 return r < 0 ? INT64_MAX : INT64_MIN; 546 } 547 return r; 548 } 549 550 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B) 551 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) 552 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) 553 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) 554 555 #define DO_UQADD_B(n, m) do_usat_b((int64_t)n + m) 556 #define DO_UQADD_H(n, m) do_usat_h((int64_t)n + m) 557 #define DO_UQADD_S(n, m) do_usat_s((int64_t)n + m) 558 559 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) 560 { 561 uint64_t r = n + m; 562 return r < n ? 
UINT64_MAX : r; 563 } 564 565 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B) 566 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H) 567 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S) 568 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d) 569 570 #define DO_SQSUB_B(n, m) do_ssat_b((int64_t)n - m) 571 #define DO_SQSUB_H(n, m) do_ssat_h((int64_t)n - m) 572 #define DO_SQSUB_S(n, m) do_ssat_s((int64_t)n - m) 573 574 static inline int64_t do_sqsub_d(int64_t n, int64_t m) 575 { 576 int64_t r = n - m; 577 if (((r ^ n) & (n ^ m)) < 0) { 578 /* Signed overflow. */ 579 return r < 0 ? INT64_MAX : INT64_MIN; 580 } 581 return r; 582 } 583 584 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B) 585 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H) 586 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S) 587 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d) 588 589 #define DO_UQSUB_B(n, m) do_usat_b((int64_t)n - m) 590 #define DO_UQSUB_H(n, m) do_usat_h((int64_t)n - m) 591 #define DO_UQSUB_S(n, m) do_usat_s((int64_t)n - m) 592 593 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m) 594 { 595 return n > m ? n - m : 0; 596 } 597 598 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B) 599 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H) 600 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S) 601 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d) 602 603 #define DO_SUQADD_B(n, m) do_ssat_b((int64_t)(int8_t)n + m) 604 #define DO_SUQADD_H(n, m) do_ssat_h((int64_t)(int16_t)n + m) 605 #define DO_SUQADD_S(n, m) do_ssat_s((int64_t)(int32_t)n + m) 606 607 static inline int64_t do_suqadd_d(int64_t n, uint64_t m) 608 { 609 uint64_t r = n + m; 610 611 if (n < 0) { 612 /* Note that m - abs(n) cannot underflow. */ 613 if (r > INT64_MAX) { 614 /* Result is either very large positive or negative. */ 615 if (m > -n) { 616 /* m > abs(n), so r is a very large positive. */ 617 return INT64_MAX; 618 } 619 /* Result is negative. */ 620 } 621 } else { 622 /* Both inputs are positive: check for overflow. */ 623 if (r < m || r > INT64_MAX) { 624 return INT64_MAX; 625 } 626 } 627 return r; 628 } 629 630 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B) 631 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H) 632 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S) 633 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d) 634 635 #define DO_USQADD_B(n, m) do_usat_b((int64_t)n + (int8_t)m) 636 #define DO_USQADD_H(n, m) do_usat_h((int64_t)n + (int16_t)m) 637 #define DO_USQADD_S(n, m) do_usat_s((int64_t)n + (int32_t)m) 638 639 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m) 640 { 641 uint64_t r = n + m; 642 643 if (m < 0) { 644 return n < -m ? 0 : r; 645 } 646 return r < n ? UINT64_MAX : r; 647 } 648 649 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B) 650 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H) 651 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S) 652 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d) 653 654 #undef DO_ZPZZ 655 #undef DO_ZPZZ_D 656 657 /* 658 * Three operand expander, operating on element pairs. 659 * If the slot I is even, the elements from from VN {I, I+1}. 660 * If the slot I is odd, the elements from from VM {I-1, I}. 661 * Load all of the input elements in each pair before overwriting output. 
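 *
 * For example (illustrative), with an all-true predicate and 32-bit
 * elements, one 16-byte group of ADDP produces
 *     d = { n0+n1, m0+m1, n2+n3, m2+m3 }
 * which is why both elements of each input pair are read before either
 * output slot is written, in case vd aliases vn or vm.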
662 */ 663 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \ 664 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 665 { \ 666 intptr_t i, opr_sz = simd_oprsz(desc); \ 667 for (i = 0; i < opr_sz; ) { \ 668 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 669 do { \ 670 TYPE n0 = *(TYPE *)(vn + H(i)); \ 671 TYPE m0 = *(TYPE *)(vm + H(i)); \ 672 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 673 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 674 if (pg & 1) { \ 675 *(TYPE *)(vd + H(i)) = OP(n0, n1); \ 676 } \ 677 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 678 if (pg & 1) { \ 679 *(TYPE *)(vd + H(i)) = OP(m0, m1); \ 680 } \ 681 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 682 } while (i & 15); \ 683 } \ 684 } 685 686 /* Similarly, specialized for 64-bit operands. */ 687 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \ 688 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 689 { \ 690 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 691 TYPE *d = vd, *n = vn, *m = vm; \ 692 uint8_t *pg = vg; \ 693 for (i = 0; i < opr_sz; i += 2) { \ 694 TYPE n0 = n[i], n1 = n[i + 1]; \ 695 TYPE m0 = m[i], m1 = m[i + 1]; \ 696 if (pg[H1(i)] & 1) { \ 697 d[i] = OP(n0, n1); \ 698 } \ 699 if (pg[H1(i + 1)] & 1) { \ 700 d[i + 1] = OP(m0, m1); \ 701 } \ 702 } \ 703 } 704 705 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD) 706 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD) 707 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD) 708 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD) 709 710 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX) 711 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX) 712 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX) 713 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX) 714 715 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN) 716 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN) 717 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN) 718 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN) 719 720 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX) 721 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX) 722 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX) 723 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX) 724 725 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN) 726 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN) 727 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN) 728 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN) 729 730 #undef DO_ZPZZ_PAIR 731 #undef DO_ZPZZ_PAIR_D 732 733 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \ 734 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 735 float_status *status, uint32_t desc) \ 736 { \ 737 intptr_t i, opr_sz = simd_oprsz(desc); \ 738 for (i = 0; i < opr_sz; ) { \ 739 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 740 do { \ 741 TYPE n0 = *(TYPE *)(vn + H(i)); \ 742 TYPE m0 = *(TYPE *)(vm + H(i)); \ 743 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 744 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 745 if (pg & 1) { \ 746 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \ 747 } \ 748 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 749 if (pg & 1) { \ 750 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \ 751 } \ 752 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 753 } while (i & 15); \ 754 } \ 755 } 756 757 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add) 758 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add) 759 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add) 760 761 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, 
H1_2, float16_maxnum) 762 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum) 763 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum) 764 765 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum) 766 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum) 767 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum) 768 769 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max) 770 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max) 771 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max) 772 773 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min) 774 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min) 775 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min) 776 777 #undef DO_ZPZZ_PAIR_FP 778 779 /* Three-operand expander, controlled by a predicate, in which the 780 * third operand is "wide". That is, for D = N op M, the same 64-bit 781 * value of M is used with all of the narrower values of N. 782 */ 783 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \ 784 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 785 { \ 786 intptr_t i, opr_sz = simd_oprsz(desc); \ 787 for (i = 0; i < opr_sz; ) { \ 788 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \ 789 TYPEW mm = *(TYPEW *)(vm + i); \ 790 do { \ 791 if (pg & 1) { \ 792 TYPE nn = *(TYPE *)(vn + H(i)); \ 793 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 794 } \ 795 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 796 } while (i & 7); \ 797 } \ 798 } 799 800 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR) 801 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR) 802 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL) 803 804 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR) 805 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 806 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 807 808 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR) 809 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 810 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 811 812 #undef DO_ZPZW 813 814 /* Fully general two-operand expander, controlled by a predicate. 815 */ 816 #define DO_ZPZ(NAME, TYPE, H, OP) \ 817 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 818 { \ 819 intptr_t i, opr_sz = simd_oprsz(desc); \ 820 for (i = 0; i < opr_sz; ) { \ 821 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 822 do { \ 823 if (pg & 1) { \ 824 TYPE nn = *(TYPE *)(vn + H(i)); \ 825 *(TYPE *)(vd + H(i)) = OP(nn); \ 826 } \ 827 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 828 } while (i & 15); \ 829 } \ 830 } 831 832 /* Similarly, specialized for 64-bit operands. 
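 * (For 64-bit elements the governing predicate bit is always bit 0 of the
 * element's own predicate byte, so the expander indexes the predicate as a
 * byte array instead of walking packed bits within a 16-bit chunk.)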
*/ 833 #define DO_ZPZ_D(NAME, TYPE, OP) \ 834 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 835 { \ 836 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 837 TYPE *d = vd, *n = vn; \ 838 uint8_t *pg = vg; \ 839 for (i = 0; i < opr_sz; i += 1) { \ 840 if (pg[H1(i)] & 1) { \ 841 TYPE nn = n[i]; \ 842 d[i] = OP(nn); \ 843 } \ 844 } \ 845 } 846 847 #define DO_CLS_B(N) (clrsb32(N) - 24) 848 #define DO_CLS_H(N) (clrsb32(N) - 16) 849 850 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B) 851 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H) 852 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32) 853 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64) 854 855 #define DO_CLZ_B(N) (clz32(N) - 24) 856 #define DO_CLZ_H(N) (clz32(N) - 16) 857 858 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B) 859 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H) 860 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32) 861 DO_ZPZ_D(sve_clz_d, uint64_t, clz64) 862 863 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8) 864 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16) 865 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32) 866 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64) 867 868 #define DO_CNOT(N) (N == 0) 869 870 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT) 871 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT) 872 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT) 873 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT) 874 875 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1)) 876 877 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS) 878 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS) 879 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS) 880 881 #define DO_AH_FABS_H(N) (float16_is_any_nan(N) ? (N) : DO_FABS(N)) 882 #define DO_AH_FABS_S(N) (float32_is_any_nan(N) ? (N) : DO_FABS(N)) 883 #define DO_AH_FABS_D(N) (float64_is_any_nan(N) ? (N) : DO_FABS(N)) 884 885 DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H) 886 DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S) 887 DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D) 888 889 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1)) 890 891 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG) 892 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG) 893 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG) 894 895 #define DO_AH_FNEG_H(N) (float16_is_any_nan(N) ? (N) : DO_FNEG(N)) 896 #define DO_AH_FNEG_S(N) (float32_is_any_nan(N) ? (N) : DO_FNEG(N)) 897 #define DO_AH_FNEG_D(N) (float64_is_any_nan(N) ? (N) : DO_FNEG(N)) 898 899 DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H) 900 DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S) 901 DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D) 902 903 #define DO_NOT(N) (~N) 904 905 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT) 906 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT) 907 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT) 908 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT) 909 910 #define DO_SXTB(N) ((int8_t)N) 911 #define DO_SXTH(N) ((int16_t)N) 912 #define DO_SXTS(N) ((int32_t)N) 913 #define DO_UXTB(N) ((uint8_t)N) 914 #define DO_UXTH(N) ((uint16_t)N) 915 #define DO_UXTS(N) ((uint32_t)N) 916 917 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB) 918 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB) 919 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH) 920 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB) 921 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH) 922 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS) 923 924 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB) 925 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB) 926 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH) 927 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB) 928 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH) 929 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS) 930 931 #define DO_ABS(N) (N < 0 ? 
-N : N) 932 933 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS) 934 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS) 935 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS) 936 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS) 937 938 #define DO_NEG(N) (-N) 939 940 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG) 941 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) 942 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) 943 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) 944 945 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) 946 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) 947 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) 948 949 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) 950 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) 951 952 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) 953 954 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc) 955 { 956 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 957 uint64_t *d = vd, *n = vn; 958 uint8_t *pg = vg; 959 960 for (i = 0; i < opr_sz; i += 2) { 961 if (pg[H1(i)] & 1) { 962 uint64_t n0 = n[i + 0]; 963 uint64_t n1 = n[i + 1]; 964 d[i + 0] = n1; 965 d[i + 1] = n0; 966 } 967 } 968 } 969 970 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) 971 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) 972 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) 973 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) 974 975 #define DO_SQABS(X) \ 976 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 977 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; }) 978 979 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS) 980 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS) 981 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS) 982 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS) 983 984 #define DO_SQNEG(X) \ 985 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 986 x_ == min_ ? -min_ - 1 : -x_; }) 987 988 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG) 989 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG) 990 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG) 991 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG) 992 993 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32) 994 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32) 995 996 /* Three-operand expander, unpredicated, in which the third operand is "wide". 997 */ 998 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ 999 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1000 { \ 1001 intptr_t i, opr_sz = simd_oprsz(desc); \ 1002 for (i = 0; i < opr_sz; ) { \ 1003 TYPEW mm = *(TYPEW *)(vm + i); \ 1004 do { \ 1005 TYPE nn = *(TYPE *)(vn + H(i)); \ 1006 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 1007 i += sizeof(TYPE); \ 1008 } while (i & 7); \ 1009 } \ 1010 } 1011 1012 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR) 1013 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR) 1014 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL) 1015 1016 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR) 1017 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 1018 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 1019 1020 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR) 1021 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 1022 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 1023 1024 #undef DO_ZZW 1025 1026 #undef DO_CLS_B 1027 #undef DO_CLS_H 1028 #undef DO_CLZ_B 1029 #undef DO_CLZ_H 1030 #undef DO_CNOT 1031 #undef DO_FABS 1032 #undef DO_FNEG 1033 #undef DO_ABS 1034 #undef DO_NEG 1035 #undef DO_ZPZ 1036 #undef DO_ZPZ_D 1037 1038 /* 1039 * Three-operand expander, unpredicated, in which the two inputs are 1040 * selected from the top or bottom half of the wide column. 
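 *
 * For example (illustrative), sve2_saddl_h with sel1 = sel2 = 0 widens and
 * adds the even (bottom) bytes of each halfword column, as for SADDLB,
 * while sel1 = sel2 = 1 selects the odd (top) bytes, as for SADDLT.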
1041 */ 1042 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1043 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1044 { \ 1045 intptr_t i, opr_sz = simd_oprsz(desc); \ 1046 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1047 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1048 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1049 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1050 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1051 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1052 } \ 1053 } 1054 1055 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1056 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1057 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1058 1059 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1060 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1061 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1062 1063 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1064 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1065 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1066 1067 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1068 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1069 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1070 1071 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1072 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1073 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1074 1075 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1076 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1077 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1078 1079 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1080 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1081 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1082 1083 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1084 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1085 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1086 1087 /* Note that the multiply cannot overflow, but the doubling can. 
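 * For example, do_sqdmull_h with widened byte inputs n = m = -128 computes
 * the product 16384, which fits in int16_t, but doubling it would be 32768,
 * so the result saturates to INT16_MAX.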
*/ 1088 static inline int16_t do_sqdmull_h(int16_t n, int16_t m) 1089 { 1090 int16_t val = n * m; 1091 return DO_SQADD_H(val, val); 1092 } 1093 1094 static inline int32_t do_sqdmull_s(int32_t n, int32_t m) 1095 { 1096 int32_t val = n * m; 1097 return DO_SQADD_S(val, val); 1098 } 1099 1100 static inline int64_t do_sqdmull_d(int64_t n, int64_t m) 1101 { 1102 int64_t val = n * m; 1103 return do_sqadd_d(val, val); 1104 } 1105 1106 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h) 1107 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1108 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1109 1110 #undef DO_ZZZ_TB 1111 1112 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1113 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1114 { \ 1115 intptr_t i, opr_sz = simd_oprsz(desc); \ 1116 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1117 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1118 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 1119 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1120 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1121 } \ 1122 } 1123 1124 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1125 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1126 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1127 1128 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1129 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1130 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1131 1132 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1133 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1134 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1135 1136 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1137 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1138 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1139 1140 #undef DO_ZZZ_WTB 1141 1142 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \ 1143 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1144 { \ 1145 intptr_t i, opr_sz = simd_oprsz(desc); \ 1146 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \ 1147 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \ 1148 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1149 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \ 1150 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \ 1151 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \ 1152 } \ 1153 } 1154 1155 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR) 1156 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR) 1157 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR) 1158 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR) 1159 1160 #undef DO_ZZZ_NTB 1161 1162 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1163 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1164 { \ 1165 intptr_t i, opr_sz = simd_oprsz(desc); \ 1166 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \ 1167 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1168 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1169 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \ 1170 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1171 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \ 1172 } \ 1173 } 1174 1175 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1176 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1177 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1178 1179 
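
/*
 * A minimal reference sketch of what one lane of the widening
 * absolute-difference-accumulate expansions here compute, assuming byte
 * inputs and a 16-bit accumulator as for sve2_sabal_h; the unsigned
 * variants differ only in zero- rather than sign-extending the inputs.
 * The helper is illustrative only (hypothetical name, unused).
 */
static inline int16_t sabal_h_lane_example(int16_t aa, int8_t nn, int8_t mm)
{
    /* DO_ABD on the sign-extended narrow inputs, accumulated into aa. */
    int16_t abd = nn >= mm ? nn - mm : mm - nn;
    return aa + abd;
}
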
DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1180 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1181 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1182 1183 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1184 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1185 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1186 1187 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1188 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1189 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1190 1191 #define DO_NMUL(N, M) -(N * M) 1192 1193 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL) 1194 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL) 1195 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL) 1196 1197 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL) 1198 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL) 1199 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL) 1200 1201 #undef DO_ZZZW_ACC 1202 1203 #define DO_XTNB(NAME, TYPE, OP) \ 1204 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1205 { \ 1206 intptr_t i, opr_sz = simd_oprsz(desc); \ 1207 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1208 TYPE nn = *(TYPE *)(vn + i); \ 1209 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \ 1210 *(TYPE *)(vd + i) = nn; \ 1211 } \ 1212 } 1213 1214 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \ 1215 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1216 { \ 1217 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \ 1218 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1219 TYPE nn = *(TYPE *)(vn + i); \ 1220 *(TYPEN *)(vd + i + odd) = OP(nn); \ 1221 } \ 1222 } 1223 1224 DO_XTNB(sve2_sqxtnb_h, int16_t, do_ssat_b) 1225 DO_XTNB(sve2_sqxtnb_s, int32_t, do_ssat_h) 1226 DO_XTNB(sve2_sqxtnb_d, int64_t, do_ssat_s) 1227 1228 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, do_ssat_b) 1229 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, do_ssat_h) 1230 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, do_ssat_s) 1231 1232 DO_XTNB(sve2_uqxtnb_h, uint16_t, do_usat_b) 1233 DO_XTNB(sve2_uqxtnb_s, uint32_t, do_usat_h) 1234 DO_XTNB(sve2_uqxtnb_d, uint64_t, do_usat_s) 1235 1236 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, do_usat_b) 1237 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, do_usat_h) 1238 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, do_usat_s) 1239 1240 DO_XTNB(sve2_sqxtunb_h, int16_t, do_usat_b) 1241 DO_XTNB(sve2_sqxtunb_s, int32_t, do_usat_h) 1242 DO_XTNB(sve2_sqxtunb_d, int64_t, do_usat_s) 1243 1244 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, do_usat_b) 1245 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, do_usat_h) 1246 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, do_usat_s) 1247 1248 #undef DO_XTNB 1249 #undef DO_XTNT 1250 1251 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1252 { 1253 intptr_t i, opr_sz = simd_oprsz(desc); 1254 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1)); 1255 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1256 uint32_t *a = va, *n = vn; 1257 uint64_t *d = vd, *m = vm; 1258 1259 for (i = 0; i < opr_sz / 8; ++i) { 1260 uint32_t e1 = a[2 * i + H4(0)]; 1261 uint32_t e2 = n[2 * i + sel] ^ inv; 1262 uint64_t c = extract64(m[i], 32, 1); 1263 /* Compute and store the entire 33-bit result at once. 
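         * The low 32 bits hold the sum and bit 32 the carry out; both are
         * written to the 64-bit destination element.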
*/ 1264 d[i] = c + e1 + e2; 1265 } 1266 } 1267 1268 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1269 { 1270 intptr_t i, opr_sz = simd_oprsz(desc); 1271 int sel = extract32(desc, SIMD_DATA_SHIFT, 1); 1272 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1273 uint64_t *d = vd, *a = va, *n = vn, *m = vm; 1274 1275 for (i = 0; i < opr_sz / 8; i += 2) { 1276 Int128 e1 = int128_make64(a[i]); 1277 Int128 e2 = int128_make64(n[i + sel] ^ inv); 1278 Int128 c = int128_make64(m[i + 1] & 1); 1279 Int128 r = int128_add(int128_add(e1, e2), c); 1280 d[i + 0] = int128_getlo(r); 1281 d[i + 1] = int128_gethi(r); 1282 } 1283 } 1284 1285 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \ 1286 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1287 { \ 1288 intptr_t i, opr_sz = simd_oprsz(desc); \ 1289 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1290 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1291 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1292 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1293 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1294 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1295 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \ 1296 } \ 1297 } 1298 1299 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1, 1300 do_sqdmull_h, DO_SQADD_H) 1301 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1302 do_sqdmull_s, DO_SQADD_S) 1303 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1304 do_sqdmull_d, do_sqadd_d) 1305 1306 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1, 1307 do_sqdmull_h, DO_SQSUB_H) 1308 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1309 do_sqdmull_s, DO_SQSUB_S) 1310 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1311 do_sqdmull_d, do_sqsub_d) 1312 1313 #undef DO_SQDMLAL 1314 1315 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \ 1316 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1317 { \ 1318 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1319 int rot = simd_data(desc); \ 1320 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1321 bool sub_r = rot == 1 || rot == 2; \ 1322 bool sub_i = rot >= 2; \ 1323 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1324 for (i = 0; i < opr_sz; i += 2) { \ 1325 TYPE elt1_a = n[H(i + sel_a)]; \ 1326 TYPE elt2_a = m[H(i + sel_a)]; \ 1327 TYPE elt2_b = m[H(i + sel_b)]; \ 1328 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \ 1329 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \ 1330 } \ 1331 } 1332 1333 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? 
-1 : 1)) 1334 1335 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA) 1336 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA) 1337 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA) 1338 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA) 1339 1340 #define DO_SQRDMLAH_B(N, M, A, S) \ 1341 do_sqrdmlah_b(N, M, A, S, true) 1342 #define DO_SQRDMLAH_H(N, M, A, S) \ 1343 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); }) 1344 #define DO_SQRDMLAH_S(N, M, A, S) \ 1345 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); }) 1346 #define DO_SQRDMLAH_D(N, M, A, S) \ 1347 do_sqrdmlah_d(N, M, A, S, true) 1348 1349 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B) 1350 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H) 1351 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S) 1352 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D) 1353 1354 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \ 1355 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1356 { \ 1357 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1358 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \ 1359 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \ 1360 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1361 bool sub_r = rot == 1 || rot == 2; \ 1362 bool sub_i = rot >= 2; \ 1363 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1364 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \ 1365 TYPE elt2_a = m[H(i + idx + sel_a)]; \ 1366 TYPE elt2_b = m[H(i + idx + sel_b)]; \ 1367 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \ 1368 TYPE elt1_a = n[H(i + j + sel_a)]; \ 1369 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \ 1370 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \ 1371 } \ 1372 } \ 1373 } 1374 1375 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA) 1376 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA) 1377 1378 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1379 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1380 1381 #undef DO_CMLA 1382 #undef DO_CMLA_FUNC 1383 #undef DO_CMLA_IDX_FUNC 1384 #undef DO_SQRDMLAH_B 1385 #undef DO_SQRDMLAH_H 1386 #undef DO_SQRDMLAH_S 1387 #undef DO_SQRDMLAH_D 1388 1389 /* Note N and M are 4 elements bundled into one unit. */ 1390 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a, 1391 int sel_a, int sel_b, int sub_i) 1392 { 1393 for (int i = 0; i <= 1; i++) { 1394 int32_t elt1_r = (int8_t)(n >> (16 * i)); 1395 int32_t elt1_i = (int8_t)(n >> (16 * i + 8)); 1396 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a)); 1397 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b)); 1398 1399 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1400 } 1401 return a; 1402 } 1403 1404 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a, 1405 int sel_a, int sel_b, int sub_i) 1406 { 1407 for (int i = 0; i <= 1; i++) { 1408 int64_t elt1_r = (int16_t)(n >> (32 * i + 0)); 1409 int64_t elt1_i = (int16_t)(n >> (32 * i + 16)); 1410 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a)); 1411 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b)); 1412 1413 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1414 } 1415 return a; 1416 } 1417 1418 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm, 1419 void *va, uint32_t desc) 1420 { 1421 int opr_sz = simd_oprsz(desc); 1422 int rot = simd_data(desc); 1423 int sel_a = rot & 1; 1424 int sel_b = sel_a ^ 1; 1425 int sub_i = (rot == 0 || rot == 3 ? 
-1 : 1); 1426 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1427 1428 for (int e = 0; e < opr_sz / 4; e++) { 1429 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1430 } 1431 } 1432 1433 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm, 1434 void *va, uint32_t desc) 1435 { 1436 int opr_sz = simd_oprsz(desc); 1437 int rot = simd_data(desc); 1438 int sel_a = rot & 1; 1439 int sel_b = sel_a ^ 1; 1440 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1441 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1442 1443 for (int e = 0; e < opr_sz / 8; e++) { 1444 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1445 } 1446 } 1447 1448 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm, 1449 void *va, uint32_t desc) 1450 { 1451 int opr_sz = simd_oprsz(desc); 1452 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1453 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2)); 1454 int sel_a = rot & 1; 1455 int sel_b = sel_a ^ 1; 1456 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1457 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1458 1459 for (int seg = 0; seg < opr_sz / 4; seg += 4) { 1460 uint32_t seg_m = m[seg + idx]; 1461 for (int e = 0; e < 4; e++) { 1462 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e], 1463 sel_a, sel_b, sub_i); 1464 } 1465 } 1466 } 1467 1468 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm, 1469 void *va, uint32_t desc) 1470 { 1471 int seg, opr_sz = simd_oprsz(desc); 1472 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1473 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 1474 int sel_a = rot & 1; 1475 int sel_b = sel_a ^ 1; 1476 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1477 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1478 1479 for (seg = 0; seg < opr_sz / 8; seg += 2) { 1480 uint64_t seg_m = m[seg + idx]; 1481 for (int e = 0; e < 2; e++) { 1482 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e], 1483 sel_a, sel_b, sub_i); 1484 } 1485 } 1486 } 1487 1488 #define DO_ZZXZ(NAME, TYPE, H, OP) \ 1489 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1490 { \ 1491 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ 1492 intptr_t i, j, idx = simd_data(desc); \ 1493 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \ 1494 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1495 TYPE mm = m[i]; \ 1496 for (j = 0; j < segment; j++) { \ 1497 d[i + j] = OP(n[i + j], mm, a[i + j]); \ 1498 } \ 1499 } \ 1500 } 1501 1502 #define DO_SQRDMLAH_H(N, M, A) \ 1503 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); }) 1504 #define DO_SQRDMLAH_S(N, M, A) \ 1505 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); }) 1506 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true) 1507 1508 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1509 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1510 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D) 1511 1512 #define DO_SQRDMLSH_H(N, M, A) \ 1513 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); }) 1514 #define DO_SQRDMLSH_S(N, M, A) \ 1515 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); }) 1516 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true) 1517 1518 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H) 1519 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S) 1520 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D) 1521 1522 #undef DO_ZZXZ 1523 1524 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1525 void HELPER(NAME)(void *vd, void *vn, 
void *vm, void *va, uint32_t desc) \ 1526 { \ 1527 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1528 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1529 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1530 for (i = 0; i < oprsz; i += 16) { \ 1531 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1532 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1533 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1534 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \ 1535 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \ 1536 } \ 1537 } \ 1538 } 1539 1540 #define DO_MLA(N, M, A) (A + N * M) 1541 1542 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA) 1543 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA) 1544 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA) 1545 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA) 1546 1547 #define DO_MLS(N, M, A) (A - N * M) 1548 1549 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS) 1550 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS) 1551 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS) 1552 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS) 1553 1554 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M)) 1555 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M)) 1556 1557 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S) 1558 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D) 1559 1560 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M)) 1561 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M)) 1562 1563 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S) 1564 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D) 1565 1566 #undef DO_MLA 1567 #undef DO_MLS 1568 #undef DO_ZZXW 1569 1570 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1571 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1572 { \ 1573 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1574 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1575 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1576 for (i = 0; i < oprsz; i += 16) { \ 1577 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1578 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1579 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1580 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \ 1581 } \ 1582 } \ 1583 } 1584 1585 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1586 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1587 1588 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1589 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1590 1591 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1592 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1593 1594 #undef DO_ZZX 1595 1596 #define DO_BITPERM(NAME, TYPE, OP) \ 1597 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1598 { \ 1599 intptr_t i, opr_sz = simd_oprsz(desc); \ 1600 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1601 TYPE nn = *(TYPE *)(vn + i); \ 1602 TYPE mm = *(TYPE *)(vm + i); \ 1603 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \ 1604 } \ 1605 } 1606 1607 static uint64_t bitextract(uint64_t data, uint64_t mask, int n) 1608 { 1609 uint64_t res = 0; 1610 int db, rb = 0; 1611 1612 for (db = 0; db < n; ++db) { 1613 if ((mask >> db) & 1) { 1614 res |= ((data >> db) & 1) 
<< rb; 1615 ++rb; 1616 } 1617 } 1618 return res; 1619 } 1620 1621 DO_BITPERM(sve2_bext_b, uint8_t, bitextract) 1622 DO_BITPERM(sve2_bext_h, uint16_t, bitextract) 1623 DO_BITPERM(sve2_bext_s, uint32_t, bitextract) 1624 DO_BITPERM(sve2_bext_d, uint64_t, bitextract) 1625 1626 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) 1627 { 1628 uint64_t res = 0; 1629 int rb, db = 0; 1630 1631 for (rb = 0; rb < n; ++rb) { 1632 if ((mask >> rb) & 1) { 1633 res |= ((data >> db) & 1) << rb; 1634 ++db; 1635 } 1636 } 1637 return res; 1638 } 1639 1640 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) 1641 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) 1642 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) 1643 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit) 1644 1645 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) 1646 { 1647 uint64_t resm = 0, resu = 0; 1648 int db, rbm = 0, rbu = 0; 1649 1650 for (db = 0; db < n; ++db) { 1651 uint64_t val = (data >> db) & 1; 1652 if ((mask >> db) & 1) { 1653 resm |= val << rbm++; 1654 } else { 1655 resu |= val << rbu++; 1656 } 1657 } 1658 1659 return resm | (resu << rbm); 1660 } 1661 1662 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) 1663 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) 1664 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) 1665 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) 1666 1667 #undef DO_BITPERM 1668 1669 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ 1670 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1671 { \ 1672 intptr_t i, opr_sz = simd_oprsz(desc); \ 1673 int sub_r = simd_data(desc); \ 1674 if (sub_r) { \ 1675 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1676 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1677 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1678 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1679 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1680 acc_r = ADD_OP(acc_r, el2_i); \ 1681 acc_i = SUB_OP(acc_i, el2_r); \ 1682 *(TYPE *)(vd + H(i)) = acc_r; \ 1683 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1684 } \ 1685 } else { \ 1686 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1687 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1688 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1689 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1690 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1691 acc_r = SUB_OP(acc_r, el2_i); \ 1692 acc_i = ADD_OP(acc_i, el2_r); \ 1693 *(TYPE *)(vd + H(i)) = acc_r; \ 1694 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1695 } \ 1696 } \ 1697 } 1698 1699 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) 1700 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) 1701 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) 1702 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) 1703 1704 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) 1705 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) 1706 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) 1707 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) 1708 1709 #undef DO_CADD 1710 1711 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ 1712 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1713 { \ 1714 intptr_t i, opr_sz = simd_oprsz(desc); \ 1715 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ 1716 int shift = simd_data(desc) >> 1; \ 1717 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1718 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ 1719 *(TYPEW *)(vd + HW(i)) = nn << shift; \ 1720 } \ 1721 } 1722 1723 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) 1724 DO_ZZI_SHLL(sve2_sshll_s, int32_t, 
int16_t, H1_4, H1_2) 1725 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) 1726 1727 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) 1728 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) 1729 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) 1730 1731 #undef DO_ZZI_SHLL 1732 1733 /* Two-operand reduction expander, controlled by a predicate. 1734 * The difference between TYPERED and TYPERET has to do with 1735 * sign-extension. E.g. for SMAX, TYPERED must be signed, 1736 * but TYPERET must be unsigned so that e.g. a 32-bit value 1737 * is not sign-extended to the ABI uint64_t return type. 1738 */ 1739 /* ??? If we were to vectorize this by hand the reduction ordering 1740 * would change. For integer operands, this is perfectly fine. 1741 */ 1742 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ 1743 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1744 { \ 1745 intptr_t i, opr_sz = simd_oprsz(desc); \ 1746 TYPERED ret = INIT; \ 1747 for (i = 0; i < opr_sz; ) { \ 1748 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 1749 do { \ 1750 if (pg & 1) { \ 1751 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ 1752 ret = OP(ret, nn); \ 1753 } \ 1754 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ 1755 } while (i & 15); \ 1756 } \ 1757 return (TYPERET)ret; \ 1758 } 1759 1760 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ 1761 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1762 { \ 1763 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 1764 TYPEE *n = vn; \ 1765 uint8_t *pg = vg; \ 1766 TYPER ret = INIT; \ 1767 for (i = 0; i < opr_sz; i += 1) { \ 1768 if (pg[H1(i)] & 1) { \ 1769 TYPEE nn = n[i]; \ 1770 ret = OP(ret, nn); \ 1771 } \ 1772 } \ 1773 return ret; \ 1774 } 1775 1776 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) 1777 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) 1778 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) 1779 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) 1780 1781 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) 1782 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) 1783 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) 1784 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) 1785 1786 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) 1787 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) 1788 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) 1789 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) 1790 1791 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1792 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1793 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1794 1795 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1796 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1797 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1798 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) 1799 1800 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) 1801 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) 1802 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) 1803 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) 1804 1805 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) 1806 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) 1807 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) 
1808 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) 1809 1810 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) 1811 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) 1812 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) 1813 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) 1814 1815 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) 1816 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) 1817 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) 1818 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) 1819 1820 #undef DO_VPZ 1821 #undef DO_VPZ_D 1822 1823 #define DO_VPQ(NAME, TYPE, H, INIT, OP) \ 1824 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 1825 { \ 1826 TYPE tmp[16 / sizeof(TYPE)] = { [0 ... 16 / sizeof(TYPE) - 1] = INIT }; \ 1827 TYPE *n = vn; uint16_t *g = vg; \ 1828 uintptr_t oprsz = simd_oprsz(desc); \ 1829 uintptr_t nseg = oprsz / 16, nsegelt = 16 / sizeof(TYPE); \ 1830 for (uintptr_t s = 0; s < nseg; s++) { \ 1831 uint16_t pg = g[H2(s)]; \ 1832 for (uintptr_t e = 0; e < nsegelt; e++, pg >>= sizeof(TYPE)) { \ 1833 if (pg & 1) { \ 1834 tmp[e] = OP(tmp[H(e)], n[s * nsegelt + H(e)]); \ 1835 } \ 1836 } \ 1837 } \ 1838 memcpy(vd, tmp, 16); \ 1839 clear_tail(vd, 16, simd_maxsz(desc)); \ 1840 } 1841 1842 DO_VPQ(sve2p1_addqv_b, uint8_t, H1, 0, DO_ADD) 1843 DO_VPQ(sve2p1_addqv_h, uint16_t, H2, 0, DO_ADD) 1844 DO_VPQ(sve2p1_addqv_s, uint32_t, H4, 0, DO_ADD) 1845 DO_VPQ(sve2p1_addqv_d, uint64_t, H8, 0, DO_ADD) 1846 1847 DO_VPQ(sve2p1_smaxqv_b, int8_t, H1, INT8_MIN, DO_MAX) 1848 DO_VPQ(sve2p1_smaxqv_h, int16_t, H2, INT16_MIN, DO_MAX) 1849 DO_VPQ(sve2p1_smaxqv_s, int32_t, H4, INT32_MIN, DO_MAX) 1850 DO_VPQ(sve2p1_smaxqv_d, int64_t, H8, INT64_MIN, DO_MAX) 1851 1852 DO_VPQ(sve2p1_sminqv_b, int8_t, H1, INT8_MAX, DO_MIN) 1853 DO_VPQ(sve2p1_sminqv_h, int16_t, H2, INT16_MAX, DO_MIN) 1854 DO_VPQ(sve2p1_sminqv_s, int32_t, H4, INT32_MAX, DO_MIN) 1855 DO_VPQ(sve2p1_sminqv_d, int64_t, H8, INT64_MAX, DO_MIN) 1856 1857 DO_VPQ(sve2p1_umaxqv_b, uint8_t, H1, 0, DO_MAX) 1858 DO_VPQ(sve2p1_umaxqv_h, uint16_t, H2, 0, DO_MAX) 1859 DO_VPQ(sve2p1_umaxqv_s, uint32_t, H4, 0, DO_MAX) 1860 DO_VPQ(sve2p1_umaxqv_d, uint64_t, H8, 0, DO_MAX) 1861 1862 DO_VPQ(sve2p1_uminqv_b, uint8_t, H1, -1, DO_MIN) 1863 DO_VPQ(sve2p1_uminqv_h, uint16_t, H2, -1, DO_MIN) 1864 DO_VPQ(sve2p1_uminqv_s, uint32_t, H4, -1, DO_MIN) 1865 DO_VPQ(sve2p1_uminqv_d, uint64_t, H8, -1, DO_MIN) 1866 1867 #undef DO_VPQ 1868 1869 /* Two vector operand, one scalar operand, unpredicated. 
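The scalar is passed in as a uint64_t and truncated to the element type, and the expander applies OP(element, scalar). For example, sve_subri_b below with s64 = 10 computes (uint8_t)(10 - n[i]) for every byte, since DO_SUBR reverses its operands.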
*/ 1870 #define DO_ZZI(NAME, TYPE, OP) \ 1871 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ 1872 { \ 1873 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1874 TYPE s = s64, *d = vd, *n = vn; \ 1875 for (i = 0; i < opr_sz; ++i) { \ 1876 d[i] = OP(n[i], s); \ 1877 } \ 1878 } 1879 1880 #define DO_SUBR(X, Y) (Y - X) 1881 1882 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) 1883 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) 1884 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) 1885 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) 1886 1887 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) 1888 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) 1889 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) 1890 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) 1891 1892 DO_ZZI(sve_smini_b, int8_t, DO_MIN) 1893 DO_ZZI(sve_smini_h, int16_t, DO_MIN) 1894 DO_ZZI(sve_smini_s, int32_t, DO_MIN) 1895 DO_ZZI(sve_smini_d, int64_t, DO_MIN) 1896 1897 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) 1898 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) 1899 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) 1900 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) 1901 1902 DO_ZZI(sve_umini_b, uint8_t, DO_MIN) 1903 DO_ZZI(sve_umini_h, uint16_t, DO_MIN) 1904 DO_ZZI(sve_umini_s, uint32_t, DO_MIN) 1905 DO_ZZI(sve_umini_d, uint64_t, DO_MIN) 1906 1907 #undef DO_ZZI 1908 1909 #define DO_LOGIC_QV(NAME, SUFF, INIT, VOP, POP) \ 1910 void HELPER(NAME ## _ ## SUFF)(void *vd, void *vn, void *vg, uint32_t desc) \ 1911 { \ 1912 unsigned seg = simd_oprsz(desc) / 16; \ 1913 uint64_t r0 = INIT, r1 = INIT; \ 1914 for (unsigned s = 0; s < seg; s++) { \ 1915 uint64_t p0 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2))); \ 1916 uint64_t p1 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2 + 1))); \ 1917 uint64_t v0 = *(uint64_t *)(vn + s * 16); \ 1918 uint64_t v1 = *(uint64_t *)(vn + s * 16 + 8); \ 1919 v0 = POP(v0, p0), v1 = POP(v1, p1); \ 1920 r0 = VOP(r0, v0), r1 = VOP(r1, v1); \ 1921 } \ 1922 *(uint64_t *)(vd + 0) = r0; \ 1923 *(uint64_t *)(vd + 8) = r1; \ 1924 clear_tail(vd, 16, simd_maxsz(desc)); \ 1925 } 1926 1927 DO_LOGIC_QV(sve2p1_orqv, b, 0, DO_ORR, DO_AND) 1928 DO_LOGIC_QV(sve2p1_orqv, h, 0, DO_ORR, DO_AND) 1929 DO_LOGIC_QV(sve2p1_orqv, s, 0, DO_ORR, DO_AND) 1930 DO_LOGIC_QV(sve2p1_orqv, d, 0, DO_ORR, DO_AND) 1931 1932 DO_LOGIC_QV(sve2p1_eorqv, b, 0, DO_EOR, DO_AND) 1933 DO_LOGIC_QV(sve2p1_eorqv, h, 0, DO_EOR, DO_AND) 1934 DO_LOGIC_QV(sve2p1_eorqv, s, 0, DO_EOR, DO_AND) 1935 DO_LOGIC_QV(sve2p1_eorqv, d, 0, DO_EOR, DO_AND) 1936 1937 DO_LOGIC_QV(sve2p1_andqv, b, -1, DO_AND, DO_ORC) 1938 DO_LOGIC_QV(sve2p1_andqv, h, -1, DO_AND, DO_ORC) 1939 DO_LOGIC_QV(sve2p1_andqv, s, -1, DO_AND, DO_ORC) 1940 DO_LOGIC_QV(sve2p1_andqv, d, -1, DO_AND, DO_ORC) 1941 1942 #undef DO_LOGIC_QV 1943 1944 #undef DO_AND 1945 #undef DO_ORR 1946 #undef DO_EOR 1947 #undef DO_BIC 1948 #undef DO_ORC 1949 #undef DO_ADD 1950 #undef DO_SUB 1951 #undef DO_MAX 1952 #undef DO_MIN 1953 #undef DO_ABD 1954 #undef DO_MUL 1955 #undef DO_DIV 1956 #undef DO_ASR 1957 #undef DO_LSR 1958 #undef DO_LSL 1959 #undef DO_SUBR 1960 1961 /* Similar to the ARM LastActiveElement pseudocode function, except the 1962 result is multiplied by the element size. This includes the not found 1963 indication; e.g. not found for esz=3 is -8. 
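Thus for esz=2 (.S elements) with the last active element at index 5, the return value is 5 << 2 = 20; with no active elements at all it is -(1 << 2) = -4.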
*/ 1964 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) 1965 { 1966 uint64_t mask = pred_esz_masks[esz]; 1967 intptr_t i = words; 1968 1969 do { 1970 uint64_t this_g = g[--i] & mask; 1971 if (this_g) { 1972 return i * 64 + (63 - clz64(this_g)); 1973 } 1974 } while (i > 0); 1975 return (intptr_t)-1 << esz; 1976 } 1977 1978 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) 1979 { 1980 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1981 uint32_t flags = PREDTEST_INIT; 1982 uint64_t *d = vd, *g = vg; 1983 intptr_t i = 0; 1984 1985 do { 1986 uint64_t this_d = d[i]; 1987 uint64_t this_g = g[i]; 1988 1989 if (this_g) { 1990 if (!(flags & 4)) { 1991 /* Set in D the first bit of G. */ 1992 this_d |= this_g & -this_g; 1993 d[i] = this_d; 1994 } 1995 flags = iter_predtest_fwd(this_d, this_g, flags); 1996 } 1997 } while (++i < words); 1998 1999 return flags; 2000 } 2001 2002 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) 2003 { 2004 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 2005 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 2006 uint32_t flags = PREDTEST_INIT; 2007 uint64_t *d = vd, *g = vg, esz_mask; 2008 intptr_t i, next; 2009 2010 next = last_active_element(vd, words, esz) + (1 << esz); 2011 esz_mask = pred_esz_masks[esz]; 2012 2013 /* Similar to the pseudocode for pnext, but scaled by ESZ 2014 so that we find the correct bit. */ 2015 if (next < words * 64) { 2016 uint64_t mask = -1; 2017 2018 if (next & 63) { 2019 mask = ~((1ull << (next & 63)) - 1); 2020 next &= -64; 2021 } 2022 do { 2023 uint64_t this_g = g[next / 64] & esz_mask & mask; 2024 if (this_g != 0) { 2025 next = (next & -64) + ctz64(this_g); 2026 break; 2027 } 2028 next += 64; 2029 mask = -1; 2030 } while (next < words * 64); 2031 } 2032 2033 i = 0; 2034 do { 2035 uint64_t this_d = 0; 2036 if (i == next / 64) { 2037 this_d = 1ull << (next & 63); 2038 } 2039 d[i] = this_d; 2040 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); 2041 } while (++i < words); 2042 2043 return flags; 2044 } 2045 2046 /* 2047 * Copy Zn into Zd, and store zero into inactive elements. 2048 * If inv, store zeros into the active elements. 
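 * For example, in the _s helper a governing-predicate byte of 0x10
 * expands to the mask 0xffffffff00000000, so only the upper 32-bit
 * element of that 64-bit chunk keeps its value from Zn and the lower
 * one is zeroed; with inv set, the kept and zeroed elements swap.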
2049 */ 2050 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) 2051 { 2052 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2053 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 2054 uint64_t *d = vd, *n = vn; 2055 uint8_t *pg = vg; 2056 2057 for (i = 0; i < opr_sz; i += 1) { 2058 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); 2059 } 2060 } 2061 2062 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) 2063 { 2064 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2065 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 2066 uint64_t *d = vd, *n = vn; 2067 uint8_t *pg = vg; 2068 2069 for (i = 0; i < opr_sz; i += 1) { 2070 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); 2071 } 2072 } 2073 2074 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) 2075 { 2076 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2077 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 2078 uint64_t *d = vd, *n = vn; 2079 uint8_t *pg = vg; 2080 2081 for (i = 0; i < opr_sz; i += 1) { 2082 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); 2083 } 2084 } 2085 2086 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) 2087 { 2088 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2089 uint64_t *d = vd, *n = vn; 2090 uint8_t *pg = vg; 2091 uint8_t inv = simd_data(desc); 2092 2093 for (i = 0; i < opr_sz; i += 1) { 2094 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); 2095 } 2096 } 2097 2098 /* Three-operand expander, immediate operand, controlled by a predicate. 2099 */ 2100 #define DO_ZPZI(NAME, TYPE, H, OP) \ 2101 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2102 { \ 2103 intptr_t i, opr_sz = simd_oprsz(desc); \ 2104 TYPE imm = simd_data(desc); \ 2105 for (i = 0; i < opr_sz; ) { \ 2106 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2107 do { \ 2108 if (pg & 1) { \ 2109 TYPE nn = *(TYPE *)(vn + H(i)); \ 2110 *(TYPE *)(vd + H(i)) = OP(nn, imm); \ 2111 } \ 2112 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2113 } while (i & 15); \ 2114 } \ 2115 } 2116 2117 /* Similarly, specialized for 64-bit operands. */ 2118 #define DO_ZPZI_D(NAME, TYPE, OP) \ 2119 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2120 { \ 2121 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2122 TYPE *d = vd, *n = vn; \ 2123 TYPE imm = simd_data(desc); \ 2124 uint8_t *pg = vg; \ 2125 for (i = 0; i < opr_sz; i += 1) { \ 2126 if (pg[H1(i)] & 1) { \ 2127 TYPE nn = n[i]; \ 2128 d[i] = OP(nn, imm); \ 2129 } \ 2130 } \ 2131 } 2132 2133 #define DO_SHR(N, M) (N >> M) 2134 #define DO_SHL(N, M) (N << M) 2135 2136 /* Arithmetic shift right for division. This rounds negative numbers 2137 toward zero as per signed division. Therefore before shifting, 2138 when N is negative, add 2**M-1. */ 2139 #define DO_ASRD(N, M) ((N + (N < 0 ? 
((__typeof(N))1 << M) - 1 : 0)) >> M) 2140 2141 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) 2142 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) 2143 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) 2144 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) 2145 2146 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) 2147 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) 2148 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) 2149 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) 2150 2151 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) 2152 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) 2153 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) 2154 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) 2155 2156 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) 2157 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) 2158 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) 2159 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) 2160 2161 /* SVE2 bitwise shift by immediate */ 2162 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) 2163 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) 2164 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) 2165 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) 2166 2167 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) 2168 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) 2169 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) 2170 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) 2171 2172 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) 2173 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) 2174 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) 2175 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) 2176 2177 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) 2178 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) 2179 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) 2180 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) 2181 2182 #define do_suqrshl_b(n, m) \ 2183 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 2184 #define do_suqrshl_h(n, m) \ 2185 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 2186 #define do_suqrshl_s(n, m) \ 2187 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) 2188 #define do_suqrshl_d(n, m) \ 2189 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) 2190 2191 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) 2192 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) 2193 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) 2194 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) 2195 2196 #undef DO_ASRD 2197 #undef DO_ZPZI 2198 #undef DO_ZPZI_D 2199 2200 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ 2201 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2202 { \ 2203 intptr_t i, opr_sz = simd_oprsz(desc); \ 2204 int shift = simd_data(desc); \ 2205 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2206 TYPEW nn = *(TYPEW *)(vn + i); \ 2207 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ 2208 } \ 2209 } 2210 2211 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 2212 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2213 { \ 2214 intptr_t i, opr_sz = simd_oprsz(desc); \ 2215 int shift = simd_data(desc); \ 2216 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2217 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2218 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ 2219 } \ 2220 } 2221 2222 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) 2223 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) 2224 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) 2225 2226 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) 2227 DO_SHRNT(sve2_shrnt_s, uint32_t, 
uint16_t, H1_4, H1_2, DO_SHR) 2228 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) 2229 2230 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) 2231 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) 2232 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) 2233 2234 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 2235 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) 2236 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) 2237 2238 #define DO_SQSHRUN_H(x, sh) do_usat_b((int64_t)(x) >> sh) 2239 #define DO_SQSHRUN_S(x, sh) do_usat_h((int64_t)(x) >> sh) 2240 #define DO_SQSHRUN_D(x, sh) do_usat_s((int64_t)(x) >> (sh < 64 ? sh : 63)) 2241 2242 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) 2243 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) 2244 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) 2245 2246 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) 2247 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) 2248 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) 2249 2250 #define DO_SQRSHRUN_H(x, sh) do_usat_b(do_srshr(x, sh)) 2251 #define DO_SQRSHRUN_S(x, sh) do_usat_h(do_srshr(x, sh)) 2252 #define DO_SQRSHRUN_D(x, sh) do_usat_s(do_srshr(x, sh)) 2253 2254 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) 2255 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) 2256 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) 2257 2258 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) 2259 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) 2260 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) 2261 2262 #define DO_SQSHRN_H(x, sh) do_ssat_b(x >> sh) 2263 #define DO_SQSHRN_S(x, sh) do_ssat_h(x >> sh) 2264 #define DO_SQSHRN_D(x, sh) do_ssat_s(x >> sh) 2265 2266 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) 2267 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) 2268 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) 2269 2270 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) 2271 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) 2272 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) 2273 2274 #define DO_SQRSHRN_H(x, sh) do_ssat_b(do_srshr(x, sh)) 2275 #define DO_SQRSHRN_S(x, sh) do_ssat_h(do_srshr(x, sh)) 2276 #define DO_SQRSHRN_D(x, sh) do_ssat_s(do_srshr(x, sh)) 2277 2278 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) 2279 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) 2280 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) 2281 2282 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) 2283 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) 2284 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) 2285 2286 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) 2287 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) 2288 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) 2289 2290 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) 2291 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) 2292 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) 2293 2294 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) 2295 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) 2296 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) 2297 2298 #define 
DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) 2299 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) 2300 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) 2301 2302 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) 2303 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) 2304 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) 2305 2306 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) 2307 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) 2308 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) 2309 2310 #undef DO_SHRNB 2311 #undef DO_SHRNT 2312 2313 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ 2314 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2315 { \ 2316 intptr_t i, opr_sz = simd_oprsz(desc); \ 2317 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2318 TYPEW nn = *(TYPEW *)(vn + i); \ 2319 TYPEW mm = *(TYPEW *)(vm + i); \ 2320 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ 2321 } \ 2322 } 2323 2324 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ 2325 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2326 { \ 2327 intptr_t i, opr_sz = simd_oprsz(desc); \ 2328 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2329 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2330 TYPEW mm = *(TYPEW *)(vm + HW(i)); \ 2331 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ 2332 } \ 2333 } 2334 2335 #define DO_ADDHN(N, M, SH) ((N + M) >> SH) 2336 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) 2337 #define DO_SUBHN(N, M, SH) ((N - M) >> SH) 2338 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) 2339 2340 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) 2341 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) 2342 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) 2343 2344 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) 2345 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) 2346 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) 2347 2348 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) 2349 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) 2350 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) 2351 2352 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) 2353 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) 2354 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) 2355 2356 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) 2357 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) 2358 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) 2359 2360 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) 2361 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) 2362 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) 2363 2364 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) 2365 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) 2366 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) 2367 2368 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) 2369 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) 2370 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) 2371 2372 #undef DO_RSUBHN 2373 #undef DO_SUBHN 2374 #undef DO_RADDHN 2375 #undef 
DO_ADDHN 2376 2377 #undef DO_BINOPNB 2378 2379 /* Fully general four-operand expander, controlled by a predicate. 2380 */ 2381 #define DO_ZPZZZ(NAME, TYPE, H, OP) \ 2382 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2383 void *vg, uint32_t desc) \ 2384 { \ 2385 intptr_t i, opr_sz = simd_oprsz(desc); \ 2386 for (i = 0; i < opr_sz; ) { \ 2387 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2388 do { \ 2389 if (pg & 1) { \ 2390 TYPE nn = *(TYPE *)(vn + H(i)); \ 2391 TYPE mm = *(TYPE *)(vm + H(i)); \ 2392 TYPE aa = *(TYPE *)(va + H(i)); \ 2393 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ 2394 } \ 2395 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2396 } while (i & 15); \ 2397 } \ 2398 } 2399 2400 /* Similarly, specialized for 64-bit operands. */ 2401 #define DO_ZPZZZ_D(NAME, TYPE, OP) \ 2402 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2403 void *vg, uint32_t desc) \ 2404 { \ 2405 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2406 TYPE *d = vd, *a = va, *n = vn, *m = vm; \ 2407 uint8_t *pg = vg; \ 2408 for (i = 0; i < opr_sz; i += 1) { \ 2409 if (pg[H1(i)] & 1) { \ 2410 TYPE aa = a[i], nn = n[i], mm = m[i]; \ 2411 d[i] = OP(aa, nn, mm); \ 2412 } \ 2413 } \ 2414 } 2415 2416 #define DO_MLA(A, N, M) (A + N * M) 2417 #define DO_MLS(A, N, M) (A - N * M) 2418 2419 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) 2420 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) 2421 2422 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) 2423 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) 2424 2425 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) 2426 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) 2427 2428 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) 2429 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) 2430 2431 #undef DO_MLA 2432 #undef DO_MLS 2433 #undef DO_ZPZZZ 2434 #undef DO_ZPZZZ_D 2435 2436 void HELPER(sve_index_b)(void *vd, uint32_t start, 2437 uint32_t incr, uint32_t desc) 2438 { 2439 intptr_t i, opr_sz = simd_oprsz(desc); 2440 uint8_t *d = vd; 2441 for (i = 0; i < opr_sz; i += 1) { 2442 d[H1(i)] = start + i * incr; 2443 } 2444 } 2445 2446 void HELPER(sve_index_h)(void *vd, uint32_t start, 2447 uint32_t incr, uint32_t desc) 2448 { 2449 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2450 uint16_t *d = vd; 2451 for (i = 0; i < opr_sz; i += 1) { 2452 d[H2(i)] = start + i * incr; 2453 } 2454 } 2455 2456 void HELPER(sve_index_s)(void *vd, uint32_t start, 2457 uint32_t incr, uint32_t desc) 2458 { 2459 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2460 uint32_t *d = vd; 2461 for (i = 0; i < opr_sz; i += 1) { 2462 d[H4(i)] = start + i * incr; 2463 } 2464 } 2465 2466 void HELPER(sve_index_d)(void *vd, uint64_t start, 2467 uint64_t incr, uint32_t desc) 2468 { 2469 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2470 uint64_t *d = vd; 2471 for (i = 0; i < opr_sz; i += 1) { 2472 d[i] = start + i * incr; 2473 } 2474 } 2475 2476 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) 2477 { 2478 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2479 uint32_t sh = simd_data(desc); 2480 uint32_t *d = vd, *n = vn, *m = vm; 2481 for (i = 0; i < opr_sz; i += 1) { 2482 d[i] = n[i] + (m[i] << sh); 2483 } 2484 } 2485 2486 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) 2487 { 2488 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2489 uint64_t sh = simd_data(desc); 2490 uint64_t *d = vd, *n = vn, *m = vm; 2491 for (i = 0; i < opr_sz; i += 1) { 2492 d[i] = n[i] + (m[i] << sh); 2493 } 2494 } 2495 2496 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) 2497 { 2498 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2499 uint64_t sh 
= simd_data(desc); 2500 uint64_t *d = vd, *n = vn, *m = vm; 2501 for (i = 0; i < opr_sz; i += 1) { 2502 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh); 2503 } 2504 } 2505 2506 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) 2507 { 2508 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2509 uint64_t sh = simd_data(desc); 2510 uint64_t *d = vd, *n = vn, *m = vm; 2511 for (i = 0; i < opr_sz; i += 1) { 2512 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); 2513 } 2514 } 2515 2516 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) 2517 { 2518 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2519 static const uint16_t coeff[] = { 2520 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, 2521 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, 2522 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, 2523 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, 2524 }; 2525 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2526 uint16_t *d = vd, *n = vn; 2527 2528 for (i = 0; i < opr_sz; i++) { 2529 uint16_t nn = n[i]; 2530 intptr_t idx = extract32(nn, 0, 5); 2531 uint16_t exp = extract32(nn, 5, 5); 2532 d[i] = coeff[idx] | (exp << 10); 2533 } 2534 } 2535 2536 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) 2537 { 2538 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2539 static const uint32_t coeff[] = { 2540 0x000000, 0x0164d2, 0x02cd87, 0x043a29, 2541 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, 2542 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, 2543 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, 2544 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, 2545 0x1ef532, 0x20b051, 0x227043, 0x243516, 2546 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, 2547 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, 2548 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, 2549 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, 2550 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, 2551 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, 2552 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, 2553 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, 2554 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, 2555 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, 2556 }; 2557 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2558 uint32_t *d = vd, *n = vn; 2559 2560 for (i = 0; i < opr_sz; i++) { 2561 uint32_t nn = n[i]; 2562 intptr_t idx = extract32(nn, 0, 6); 2563 uint32_t exp = extract32(nn, 6, 8); 2564 d[i] = coeff[idx] | (exp << 23); 2565 } 2566 } 2567 2568 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) 2569 { 2570 /* These constants are cut-and-paste directly from the ARM pseudocode. 
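Each entry holds the 52 fraction bits of 2^(i/64); the helper pastes the exponent field from the input above them, so the result is roughly 2^(i/64) scaled by the selected power of two. For instance coeff[32] = 0x6A09E667F3BCD, the fraction of sqrt(2) = 2^(32/64).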
*/ 2571 static const uint64_t coeff[] = { 2572 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, 2573 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, 2574 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, 2575 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, 2576 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, 2577 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, 2578 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, 2579 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, 2580 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, 2581 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, 2582 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, 2583 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, 2584 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, 2585 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, 2586 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, 2587 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, 2588 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, 2589 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, 2590 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, 2591 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, 2592 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, 2593 0xFA7C1819E90D8ull, 2594 }; 2595 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2596 uint64_t *d = vd, *n = vn; 2597 2598 for (i = 0; i < opr_sz; i++) { 2599 uint64_t nn = n[i]; 2600 intptr_t idx = extract32(nn, 0, 6); 2601 uint64_t exp = extract32(nn, 6, 11); 2602 d[i] = coeff[idx] | (exp << 52); 2603 } 2604 } 2605 2606 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) 2607 { 2608 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2609 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2610 uint16_t *d = vd, *n = vn, *m = vm; 2611 for (i = 0; i < opr_sz; i += 1) { 2612 uint16_t nn = n[i]; 2613 uint16_t mm = m[i]; 2614 if (mm & 1) { 2615 nn = float16_one; 2616 } 2617 if (mm & 2) { 2618 nn = float16_maybe_ah_chs(nn, fpcr_ah); 2619 } 2620 d[i] = nn; 2621 } 2622 } 2623 2624 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) 2625 { 2626 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2627 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2628 uint32_t *d = vd, *n = vn, *m = vm; 2629 for (i = 0; i < opr_sz; i += 1) { 2630 uint32_t nn = n[i]; 2631 uint32_t mm = m[i]; 2632 if (mm & 1) { 2633 nn = float32_one; 2634 } 2635 if (mm & 2) { 2636 nn = float32_maybe_ah_chs(nn, fpcr_ah); 2637 } 2638 d[i] = nn; 2639 } 2640 } 2641 2642 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) 2643 { 2644 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2645 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2646 uint64_t *d = vd, *n = vn, *m = vm; 2647 for (i = 0; i < opr_sz; i += 1) { 2648 uint64_t nn = n[i]; 2649 uint64_t mm = m[i]; 2650 if (mm & 1) { 2651 nn = float64_one; 2652 } 2653 if (mm & 2) { 2654 nn = float64_maybe_ah_chs(nn, fpcr_ah); 2655 } 2656 d[i] = nn; 2657 } 2658 } 2659 2660 /* 2661 * Signed saturating addition with scalar operand. 
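 * The scalar B is added to every element of A, saturating at the
 * element width; e.g. for the byte helper, 0x7f + 1 yields 0x7f and
 * -128 + -1 yields -128 (assuming DO_SQADD_B and friends, defined
 * earlier in this file, clamp to the signed min/max as usual).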
2662 */ 2663 2664 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2665 { 2666 intptr_t i, oprsz = simd_oprsz(desc); 2667 2668 for (i = 0; i < oprsz; i += sizeof(int8_t)) { 2669 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); 2670 } 2671 } 2672 2673 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2674 { 2675 intptr_t i, oprsz = simd_oprsz(desc); 2676 2677 for (i = 0; i < oprsz; i += sizeof(int16_t)) { 2678 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); 2679 } 2680 } 2681 2682 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2683 { 2684 intptr_t i, oprsz = simd_oprsz(desc); 2685 2686 for (i = 0; i < oprsz; i += sizeof(int32_t)) { 2687 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); 2688 } 2689 } 2690 2691 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) 2692 { 2693 intptr_t i, oprsz = simd_oprsz(desc); 2694 2695 for (i = 0; i < oprsz; i += sizeof(int64_t)) { 2696 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); 2697 } 2698 } 2699 2700 /* 2701 * Unsigned saturating addition with scalar operand. 2702 */ 2703 2704 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2705 { 2706 intptr_t i, oprsz = simd_oprsz(desc); 2707 2708 for (i = 0; i < oprsz; i += sizeof(uint8_t)) { 2709 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); 2710 } 2711 } 2712 2713 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2714 { 2715 intptr_t i, oprsz = simd_oprsz(desc); 2716 2717 for (i = 0; i < oprsz; i += sizeof(uint16_t)) { 2718 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); 2719 } 2720 } 2721 2722 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2723 { 2724 intptr_t i, oprsz = simd_oprsz(desc); 2725 2726 for (i = 0; i < oprsz; i += sizeof(uint32_t)) { 2727 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); 2728 } 2729 } 2730 2731 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2732 { 2733 intptr_t i, oprsz = simd_oprsz(desc); 2734 2735 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2736 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); 2737 } 2738 } 2739 2740 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2741 { 2742 intptr_t i, oprsz = simd_oprsz(desc); 2743 2744 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2745 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); 2746 } 2747 } 2748 2749 /* Two operand predicated copy immediate with merge. All valid immediates 2750 * can fit within 17 signed bits in the simd_data field. 
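 * For example, in the _b helper the immediate is replicated to all
 * byte lanes with dup_const, and a governing-predicate byte of 0x0f
 * expands to the byte mask 0x00000000ffffffff, so the low four bytes
 * of that 64-bit chunk take the immediate and the upper four keep Zn.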
2751 */ 2752 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, 2753 uint64_t mm, uint32_t desc) 2754 { 2755 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2756 uint64_t *d = vd, *n = vn; 2757 uint8_t *pg = vg; 2758 2759 mm = dup_const(MO_8, mm); 2760 for (i = 0; i < opr_sz; i += 1) { 2761 uint64_t nn = n[i]; 2762 uint64_t pp = expand_pred_b(pg[H1(i)]); 2763 d[i] = (mm & pp) | (nn & ~pp); 2764 } 2765 } 2766 2767 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, 2768 uint64_t mm, uint32_t desc) 2769 { 2770 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2771 uint64_t *d = vd, *n = vn; 2772 uint8_t *pg = vg; 2773 2774 mm = dup_const(MO_16, mm); 2775 for (i = 0; i < opr_sz; i += 1) { 2776 uint64_t nn = n[i]; 2777 uint64_t pp = expand_pred_h(pg[H1(i)]); 2778 d[i] = (mm & pp) | (nn & ~pp); 2779 } 2780 } 2781 2782 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, 2783 uint64_t mm, uint32_t desc) 2784 { 2785 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2786 uint64_t *d = vd, *n = vn; 2787 uint8_t *pg = vg; 2788 2789 mm = dup_const(MO_32, mm); 2790 for (i = 0; i < opr_sz; i += 1) { 2791 uint64_t nn = n[i]; 2792 uint64_t pp = expand_pred_s(pg[H1(i)]); 2793 d[i] = (mm & pp) | (nn & ~pp); 2794 } 2795 } 2796 2797 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, 2798 uint64_t mm, uint32_t desc) 2799 { 2800 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2801 uint64_t *d = vd, *n = vn; 2802 uint8_t *pg = vg; 2803 2804 for (i = 0; i < opr_sz; i += 1) { 2805 uint64_t nn = n[i]; 2806 d[i] = (pg[H1(i)] & 1 ? mm : nn); 2807 } 2808 } 2809 2810 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) 2811 { 2812 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2813 uint64_t *d = vd; 2814 uint8_t *pg = vg; 2815 2816 val = dup_const(MO_8, val); 2817 for (i = 0; i < opr_sz; i += 1) { 2818 d[i] = val & expand_pred_b(pg[H1(i)]); 2819 } 2820 } 2821 2822 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) 2823 { 2824 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2825 uint64_t *d = vd; 2826 uint8_t *pg = vg; 2827 2828 val = dup_const(MO_16, val); 2829 for (i = 0; i < opr_sz; i += 1) { 2830 d[i] = val & expand_pred_h(pg[H1(i)]); 2831 } 2832 } 2833 2834 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) 2835 { 2836 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2837 uint64_t *d = vd; 2838 uint8_t *pg = vg; 2839 2840 val = dup_const(MO_32, val); 2841 for (i = 0; i < opr_sz; i += 1) { 2842 d[i] = val & expand_pred_s(pg[H1(i)]); 2843 } 2844 } 2845 2846 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) 2847 { 2848 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2849 uint64_t *d = vd; 2850 uint8_t *pg = vg; 2851 2852 for (i = 0; i < opr_sz; i += 1) { 2853 d[i] = (pg[H1(i)] & 1 ? val : 0); 2854 } 2855 } 2856 2857 /* Big-endian hosts need to frob the byte indices. If the copy 2858 * happens to be 8-byte aligned, then no frobbing necessary. 
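 * The combined misalignment of destination, source and length,
 * (d | s | n) & 7, selects the unit size below: the 4-, 2- and 1-byte
 * cases go through the H1_4/H1_2/H1 index-swizzling macros so that
 * element order within each 64-bit chunk is preserved on such hosts.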
2859 */ 2860 static void swap_memmove(void *vd, void *vs, size_t n) 2861 { 2862 uintptr_t d = (uintptr_t)vd; 2863 uintptr_t s = (uintptr_t)vs; 2864 uintptr_t o = (d | s | n) & 7; 2865 size_t i; 2866 2867 #if !HOST_BIG_ENDIAN 2868 o = 0; 2869 #endif 2870 switch (o) { 2871 case 0: 2872 memmove(vd, vs, n); 2873 break; 2874 2875 case 4: 2876 if (d < s || d >= s + n) { 2877 for (i = 0; i < n; i += 4) { 2878 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2879 } 2880 } else { 2881 for (i = n; i > 0; ) { 2882 i -= 4; 2883 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2884 } 2885 } 2886 break; 2887 2888 case 2: 2889 case 6: 2890 if (d < s || d >= s + n) { 2891 for (i = 0; i < n; i += 2) { 2892 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2893 } 2894 } else { 2895 for (i = n; i > 0; ) { 2896 i -= 2; 2897 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2898 } 2899 } 2900 break; 2901 2902 default: 2903 if (d < s || d >= s + n) { 2904 for (i = 0; i < n; i++) { 2905 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2906 } 2907 } else { 2908 for (i = n; i > 0; ) { 2909 i -= 1; 2910 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2911 } 2912 } 2913 break; 2914 } 2915 } 2916 2917 /* Similarly for memset of 0. */ 2918 static void swap_memzero(void *vd, size_t n) 2919 { 2920 uintptr_t d = (uintptr_t)vd; 2921 uintptr_t o = (d | n) & 7; 2922 size_t i; 2923 2924 /* Usually, the first bit of a predicate is set, so N is 0. */ 2925 if (likely(n == 0)) { 2926 return; 2927 } 2928 2929 #if !HOST_BIG_ENDIAN 2930 o = 0; 2931 #endif 2932 switch (o) { 2933 case 0: 2934 memset(vd, 0, n); 2935 break; 2936 2937 case 4: 2938 for (i = 0; i < n; i += 4) { 2939 *(uint32_t *)H1_4(d + i) = 0; 2940 } 2941 break; 2942 2943 case 2: 2944 case 6: 2945 for (i = 0; i < n; i += 2) { 2946 *(uint16_t *)H1_2(d + i) = 0; 2947 } 2948 break; 2949 2950 default: 2951 for (i = 0; i < n; i++) { 2952 *(uint8_t *)H1(d + i) = 0; 2953 } 2954 break; 2955 } 2956 } 2957 2958 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) 2959 { 2960 intptr_t opr_sz = simd_oprsz(desc); 2961 size_t n_ofs = simd_data(desc); 2962 size_t n_siz = opr_sz - n_ofs; 2963 2964 if (vd != vm) { 2965 swap_memmove(vd, vn + n_ofs, n_siz); 2966 swap_memmove(vd + n_siz, vm, n_ofs); 2967 } else if (vd != vn) { 2968 swap_memmove(vd + n_siz, vd, n_ofs); 2969 swap_memmove(vd, vn + n_ofs, n_siz); 2970 } else { 2971 /* vd == vn == vm. Need temp space. 
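The result is the register rotated down by n_ofs bytes (e.g. opr_sz = 16, n_ofs = 3 gives bytes 3..15 followed by bytes 0..2), so save the low n_ofs bytes, move the rest down, then append the saved bytes.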
*/ 2972 ARMVectorReg tmp; 2973 swap_memmove(&tmp, vm, n_ofs); 2974 swap_memmove(vd, vd + n_ofs, n_siz); 2975 memcpy(vd + n_siz, &tmp, n_ofs); 2976 } 2977 } 2978 2979 #define DO_INSR(NAME, TYPE, H) \ 2980 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ 2981 { \ 2982 intptr_t opr_sz = simd_oprsz(desc); \ 2983 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ 2984 *(TYPE *)(vd + H(0)) = val; \ 2985 } 2986 2987 DO_INSR(sve_insr_b, uint8_t, H1) 2988 DO_INSR(sve_insr_h, uint16_t, H1_2) 2989 DO_INSR(sve_insr_s, uint32_t, H1_4) 2990 DO_INSR(sve_insr_d, uint64_t, H1_8) 2991 2992 #undef DO_INSR 2993 2994 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) 2995 { 2996 intptr_t i, j, opr_sz = simd_oprsz(desc); 2997 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2998 uint64_t f = *(uint64_t *)(vn + i); 2999 uint64_t b = *(uint64_t *)(vn + j); 3000 *(uint64_t *)(vd + i) = bswap64(b); 3001 *(uint64_t *)(vd + j) = bswap64(f); 3002 } 3003 } 3004 3005 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) 3006 { 3007 intptr_t i, j, opr_sz = simd_oprsz(desc); 3008 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 3009 uint64_t f = *(uint64_t *)(vn + i); 3010 uint64_t b = *(uint64_t *)(vn + j); 3011 *(uint64_t *)(vd + i) = hswap64(b); 3012 *(uint64_t *)(vd + j) = hswap64(f); 3013 } 3014 } 3015 3016 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) 3017 { 3018 intptr_t i, j, opr_sz = simd_oprsz(desc); 3019 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 3020 uint64_t f = *(uint64_t *)(vn + i); 3021 uint64_t b = *(uint64_t *)(vn + j); 3022 *(uint64_t *)(vd + i) = rol64(b, 32); 3023 *(uint64_t *)(vd + j) = rol64(f, 32); 3024 } 3025 } 3026 3027 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) 3028 { 3029 intptr_t i, j, opr_sz = simd_oprsz(desc); 3030 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 3031 uint64_t f = *(uint64_t *)(vn + i); 3032 uint64_t b = *(uint64_t *)(vn + j); 3033 *(uint64_t *)(vd + i) = b; 3034 *(uint64_t *)(vd + j) = f; 3035 } 3036 } 3037 3038 /* 3039 * TODO: This could use half_shuffle64 and similar bit tricks to 3040 * expand blocks of bits at once. 3041 */ 3042 #define DO_PMOV_PV(NAME, ESIZE) \ 3043 void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ 3044 { \ 3045 unsigned vl = simd_oprsz(desc); \ 3046 unsigned idx = simd_data(desc); \ 3047 unsigned elements = vl / ESIZE; \ 3048 ARMPredicateReg *d = vd; \ 3049 ARMVectorReg *s = vs; \ 3050 memset(d, 0, sizeof(*d)); \ 3051 for (unsigned e = 0; e < elements; ++e) { \ 3052 depositn(d->p, e * ESIZE, 1, extractn(s->d, elements * idx + e, 1)); \ 3053 } \ 3054 } 3055 3056 DO_PMOV_PV(pmov_pv_h, 2) 3057 DO_PMOV_PV(pmov_pv_s, 4) 3058 DO_PMOV_PV(pmov_pv_d, 8) 3059 3060 #undef DO_PMOV_PV 3061 3062 /* 3063 * TODO: This could use half_unshuffle64 and similar bit tricks to 3064 * compress blocks of bits at once. 
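 * (This is the reverse of the expander above: predicate bits spaced
 * ESIZE apart are packed into contiguous bits of slice IDX of the
 * vector, one bit per element, much as compress_bits() further below
 * does for a single 64-bit word.)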
3065 */ 3066 #define DO_PMOV_VP(NAME, ESIZE) \ 3067 void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \ 3068 { \ 3069 unsigned vl = simd_oprsz(desc); \ 3070 unsigned idx = simd_data(desc); \ 3071 unsigned elements = vl / ESIZE; \ 3072 ARMVectorReg *d = vd; \ 3073 ARMPredicateReg *s = vs; \ 3074 if (idx == 0) { \ 3075 memset(d, 0, vl); \ 3076 } \ 3077 for (unsigned e = 0; e < elements; ++e) { \ 3078 depositn(d->d, elements * idx + e, 1, extractn(s->p, e * ESIZE, 1)); \ 3079 } \ 3080 } 3081 3082 DO_PMOV_VP(pmov_vp_h, 2) 3083 DO_PMOV_VP(pmov_vp_s, 4) 3084 DO_PMOV_VP(pmov_vp_d, 8) 3085 3086 #undef DO_PMOV_VP 3087 3088 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); 3089 3090 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, 3091 bool is_tbx, tb_impl_fn *fn) 3092 { 3093 ARMVectorReg scratch; 3094 uintptr_t oprsz = simd_oprsz(desc); 3095 3096 if (unlikely(vd == vn)) { 3097 vn = memcpy(&scratch, vn, oprsz); 3098 } 3099 3100 fn(vd, vn, NULL, vm, oprsz, is_tbx); 3101 } 3102 3103 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, 3104 uint32_t desc, bool is_tbx, tb_impl_fn *fn) 3105 { 3106 ARMVectorReg scratch; 3107 uintptr_t oprsz = simd_oprsz(desc); 3108 3109 if (unlikely(vd == vn0)) { 3110 vn0 = memcpy(&scratch, vn0, oprsz); 3111 if (vd == vn1) { 3112 vn1 = vn0; 3113 } 3114 } else if (unlikely(vd == vn1)) { 3115 vn1 = memcpy(&scratch, vn1, oprsz); 3116 } 3117 3118 fn(vd, vn0, vn1, vm, oprsz, is_tbx); 3119 } 3120 3121 #define DO_TB(SUFF, TYPE, H) \ 3122 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ 3123 void *vm, uintptr_t oprsz, bool is_tbx) \ 3124 { \ 3125 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ 3126 uintptr_t i, nelem = oprsz / sizeof(TYPE); \ 3127 for (i = 0; i < nelem; ++i) { \ 3128 TYPE index = indexes[H1(i)], val = 0; \ 3129 if (index < nelem) { \ 3130 val = tbl0[H(index)]; \ 3131 } else { \ 3132 index -= nelem; \ 3133 if (tbl1 && index < nelem) { \ 3134 val = tbl1[H(index)]; \ 3135 } else if (is_tbx) { \ 3136 continue; \ 3137 } \ 3138 } \ 3139 d[H(i)] = val; \ 3140 } \ 3141 } \ 3142 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3143 { \ 3144 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ 3145 } \ 3146 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ 3147 void *vm, uint32_t desc) \ 3148 { \ 3149 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ 3150 } \ 3151 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3152 { \ 3153 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ 3154 } 3155 3156 DO_TB(b, uint8_t, H1) 3157 DO_TB(h, uint16_t, H2) 3158 DO_TB(s, uint32_t, H4) 3159 DO_TB(d, uint64_t, H8) 3160 3161 #undef DO_TB 3162 3163 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ 3164 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 3165 { \ 3166 intptr_t i, opr_sz = simd_oprsz(desc); \ 3167 TYPED *d = vd; \ 3168 TYPES *n = vn; \ 3169 ARMVectorReg tmp; \ 3170 if (unlikely(vn - vd < opr_sz)) { \ 3171 n = memcpy(&tmp, n, opr_sz / 2); \ 3172 } \ 3173 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ 3174 d[HD(i)] = n[HS(i)]; \ 3175 } \ 3176 } 3177 3178 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) 3179 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) 3180 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) 3181 3182 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) 3183 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) 3184 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) 3185 3186 #undef DO_UNPK 3187 3188 /* Mask of bits included 
in the even numbered predicates of width esz. 3189 * We also use this for expand_bits/compress_bits, and so extend the 3190 * same pattern out to 16-bit units. 3191 */ 3192 static const uint64_t even_bit_esz_masks[5] = { 3193 0x5555555555555555ull, 3194 0x3333333333333333ull, 3195 0x0f0f0f0f0f0f0f0full, 3196 0x00ff00ff00ff00ffull, 3197 0x0000ffff0000ffffull, 3198 }; 3199 3200 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits. 3201 * For N==0, this corresponds to the operation that in qemu/bitops.h 3202 * we call half_shuffle64; this algorithm is from Hacker's Delight, 3203 * section 7-2 Shuffling Bits. 3204 */ 3205 static uint64_t expand_bits(uint64_t x, int n) 3206 { 3207 int i; 3208 3209 x &= 0xffffffffu; 3210 for (i = 4; i >= n; i--) { 3211 int sh = 1 << i; 3212 x = ((x << sh) | x) & even_bit_esz_masks[i]; 3213 } 3214 return x; 3215 } 3216 3217 /* Compress units of 2**(N+1) bits to units of 2**N bits. 3218 * For N==0, this corresponds to the operation that in qemu/bitops.h 3219 * we call half_unshuffle64; this algorithm is from Hacker's Delight, 3220 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. 3221 */ 3222 static uint64_t compress_bits(uint64_t x, int n) 3223 { 3224 int i; 3225 3226 for (i = n; i <= 4; i++) { 3227 int sh = 1 << i; 3228 x &= even_bit_esz_masks[i]; 3229 x = (x >> sh) | x; 3230 } 3231 return x & 0xffffffffu; 3232 } 3233 3234 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3235 { 3236 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3237 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3238 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3239 int esize = 1 << esz; 3240 uint64_t *d = vd; 3241 intptr_t i; 3242 3243 if (oprsz <= 8) { 3244 uint64_t nn = *(uint64_t *)vn; 3245 uint64_t mm = *(uint64_t *)vm; 3246 int half = 4 * oprsz; 3247 3248 nn = extract64(nn, high * half, half); 3249 mm = extract64(mm, high * half, half); 3250 nn = expand_bits(nn, esz); 3251 mm = expand_bits(mm, esz); 3252 d[0] = nn | (mm << esize); 3253 } else { 3254 ARMPredicateReg tmp; 3255 3256 /* We produce output faster than we consume input. 3257 Therefore we must be mindful of possible overlap. 
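Each input contributes only oprsz/2 bytes of predicate while the full oprsz bytes of D are written, so when D aliases N or M the input is copied into the local ARMPredicateReg temporary first.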
*/ 3258 if (vd == vn) { 3259 vn = memcpy(&tmp, vn, oprsz); 3260 if (vd == vm) { 3261 vm = vn; 3262 } 3263 } else if (vd == vm) { 3264 vm = memcpy(&tmp, vm, oprsz); 3265 } 3266 if (high) { 3267 high = oprsz >> 1; 3268 } 3269 3270 if ((oprsz & 7) == 0) { 3271 uint32_t *n = vn, *m = vm; 3272 high >>= 2; 3273 3274 for (i = 0; i < oprsz / 8; i++) { 3275 uint64_t nn = n[H4(high + i)]; 3276 uint64_t mm = m[H4(high + i)]; 3277 3278 nn = expand_bits(nn, esz); 3279 mm = expand_bits(mm, esz); 3280 d[i] = nn | (mm << esize); 3281 } 3282 } else { 3283 uint8_t *n = vn, *m = vm; 3284 uint16_t *d16 = vd; 3285 3286 for (i = 0; i < oprsz / 2; i++) { 3287 uint16_t nn = n[H1(high + i)]; 3288 uint16_t mm = m[H1(high + i)]; 3289 3290 nn = expand_bits(nn, esz); 3291 mm = expand_bits(mm, esz); 3292 d16[H2(i)] = nn | (mm << esize); 3293 } 3294 } 3295 } 3296 } 3297 3298 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3299 { 3300 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3301 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3302 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; 3303 uint64_t *d = vd, *n = vn, *m = vm; 3304 uint64_t l, h; 3305 intptr_t i; 3306 3307 if (oprsz <= 8) { 3308 l = compress_bits(n[0] >> odd, esz); 3309 h = compress_bits(m[0] >> odd, esz); 3310 d[0] = l | (h << (4 * oprsz)); 3311 } else { 3312 ARMPredicateReg tmp_m; 3313 intptr_t oprsz_16 = oprsz / 16; 3314 3315 if ((vm - vd) < (uintptr_t)oprsz) { 3316 m = memcpy(&tmp_m, vm, oprsz); 3317 } 3318 3319 for (i = 0; i < oprsz_16; i++) { 3320 l = n[2 * i + 0]; 3321 h = n[2 * i + 1]; 3322 l = compress_bits(l >> odd, esz); 3323 h = compress_bits(h >> odd, esz); 3324 d[i] = l | (h << 32); 3325 } 3326 3327 /* 3328 * For VL which is not a multiple of 512, the results from M do not 3329 * align nicely with the uint64_t for D. Put the aligned results 3330 * from M into TMP_M and then copy it into place afterward. 
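 * (The M half of the result begins at byte oprsz/2 of D, which is
 * then misaligned with respect to the uint64_t stores, hence the
 * staging buffer and the final swap_memmove into place.)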
3331 */ 3332 if (oprsz & 15) { 3333 int final_shift = (oprsz & 15) * 2; 3334 3335 l = n[2 * i + 0]; 3336 h = n[2 * i + 1]; 3337 l = compress_bits(l >> odd, esz); 3338 h = compress_bits(h >> odd, esz); 3339 d[i] = l | (h << final_shift); 3340 3341 for (i = 0; i < oprsz_16; i++) { 3342 l = m[2 * i + 0]; 3343 h = m[2 * i + 1]; 3344 l = compress_bits(l >> odd, esz); 3345 h = compress_bits(h >> odd, esz); 3346 tmp_m.p[i] = l | (h << 32); 3347 } 3348 l = m[2 * i + 0]; 3349 h = m[2 * i + 1]; 3350 l = compress_bits(l >> odd, esz); 3351 h = compress_bits(h >> odd, esz); 3352 tmp_m.p[i] = l | (h << final_shift); 3353 3354 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); 3355 } else { 3356 for (i = 0; i < oprsz_16; i++) { 3357 l = m[2 * i + 0]; 3358 h = m[2 * i + 1]; 3359 l = compress_bits(l >> odd, esz); 3360 h = compress_bits(h >> odd, esz); 3361 d[oprsz_16 + i] = l | (h << 32); 3362 } 3363 } 3364 } 3365 } 3366 3367 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3368 { 3369 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3370 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3371 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); 3372 uint64_t *d = vd, *n = vn, *m = vm; 3373 uint64_t mask; 3374 int shr, shl; 3375 intptr_t i; 3376 3377 shl = 1 << esz; 3378 shr = 0; 3379 mask = even_bit_esz_masks[esz]; 3380 if (odd) { 3381 mask <<= shl; 3382 shr = shl; 3383 shl = 0; 3384 } 3385 3386 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { 3387 uint64_t nn = (n[i] & mask) >> shr; 3388 uint64_t mm = (m[i] & mask) << shl; 3389 d[i] = nn + mm; 3390 } 3391 } 3392 3393 /* Reverse units of 2**N bits. */ 3394 static uint64_t reverse_bits_64(uint64_t x, int n) 3395 { 3396 int i, sh; 3397 3398 x = bswap64(x); 3399 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3400 uint64_t mask = even_bit_esz_masks[i]; 3401 x = ((x & mask) << sh) | ((x >> sh) & mask); 3402 } 3403 return x; 3404 } 3405 3406 static uint8_t reverse_bits_8(uint8_t x, int n) 3407 { 3408 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; 3409 int i, sh; 3410 3411 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3412 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); 3413 } 3414 return x; 3415 } 3416 3417 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) 3418 { 3419 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3420 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3421 intptr_t i, oprsz_2 = oprsz / 2; 3422 3423 if (oprsz <= 8) { 3424 uint64_t l = *(uint64_t *)vn; 3425 l = reverse_bits_64(l << (64 - 8 * oprsz), esz); 3426 *(uint64_t *)vd = l; 3427 } else if ((oprsz & 15) == 0) { 3428 for (i = 0; i < oprsz_2; i += 8) { 3429 intptr_t ih = oprsz - 8 - i; 3430 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); 3431 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); 3432 *(uint64_t *)(vd + i) = h; 3433 *(uint64_t *)(vd + ih) = l; 3434 } 3435 } else { 3436 for (i = 0; i < oprsz_2; i += 1) { 3437 intptr_t il = H1(i); 3438 intptr_t ih = H1(oprsz - 1 - i); 3439 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); 3440 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); 3441 *(uint8_t *)(vd + il) = h; 3442 *(uint8_t *)(vd + ih) = l; 3443 } 3444 } 3445 } 3446 3447 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) 3448 { 3449 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3450 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3451 uint64_t *d = vd; 3452 intptr_t i; 3453 3454 if (oprsz <= 8) { 3455 uint64_t nn = *(uint64_t *)vn; 3456 int half = 4 * oprsz; 3457 3458 nn = 
extract64(nn, high * half, half); 3459 nn = expand_bits(nn, 0); 3460 d[0] = nn; 3461 } else { 3462 ARMPredicateReg tmp_n; 3463 3464 /* We produce output faster than we consume input. 3465 Therefore we must be mindful of possible overlap. */ 3466 if ((vn - vd) < (uintptr_t)oprsz) { 3467 vn = memcpy(&tmp_n, vn, oprsz); 3468 } 3469 if (high) { 3470 high = oprsz >> 1; 3471 } 3472 3473 if ((oprsz & 7) == 0) { 3474 uint32_t *n = vn; 3475 high >>= 2; 3476 3477 for (i = 0; i < oprsz / 8; i++) { 3478 uint64_t nn = n[H4(high + i)]; 3479 d[i] = expand_bits(nn, 0); 3480 } 3481 } else { 3482 uint16_t *d16 = vd; 3483 uint8_t *n = vn; 3484 3485 for (i = 0; i < oprsz / 2; i++) { 3486 uint16_t nn = n[H1(high + i)]; 3487 d16[H2(i)] = expand_bits(nn, 0); 3488 } 3489 } 3490 } 3491 } 3492 3493 #define DO_ZIP(NAME, TYPE, H) \ 3494 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3495 { \ 3496 intptr_t oprsz = simd_oprsz(desc); \ 3497 intptr_t odd_ofs = simd_data(desc); \ 3498 intptr_t i, oprsz_2 = oprsz / 2; \ 3499 ARMVectorReg tmp_n, tmp_m; \ 3500 /* We produce output faster than we consume input. \ 3501 Therefore we must be mindful of possible overlap. */ \ 3502 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ 3503 vn = memcpy(&tmp_n, vn, oprsz); \ 3504 } \ 3505 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3506 vm = memcpy(&tmp_m, vm, oprsz); \ 3507 } \ 3508 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ 3509 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ 3510 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ 3511 *(TYPE *)(vm + odd_ofs + H(i)); \ 3512 } \ 3513 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3514 memset(vd + oprsz - 16, 0, 16); \ 3515 } \ 3516 } 3517 3518 DO_ZIP(sve_zip_b, uint8_t, H1) 3519 DO_ZIP(sve_zip_h, uint16_t, H1_2) 3520 DO_ZIP(sve_zip_s, uint32_t, H1_4) 3521 DO_ZIP(sve_zip_d, uint64_t, H1_8) 3522 DO_ZIP(sve2_zip_q, Int128, ) 3523 3524 #define DO_UZP(NAME, TYPE, H) \ 3525 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3526 { \ 3527 intptr_t oprsz = simd_oprsz(desc); \ 3528 intptr_t odd_ofs = simd_data(desc); \ 3529 intptr_t i, p; \ 3530 ARMVectorReg tmp_m; \ 3531 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3532 vm = memcpy(&tmp_m, vm, oprsz); \ 3533 } \ 3534 i = 0, p = odd_ofs; \ 3535 do { \ 3536 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ 3537 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3538 } while (p < oprsz); \ 3539 p -= oprsz; \ 3540 do { \ 3541 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ 3542 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3543 } while (p < oprsz); \ 3544 tcg_debug_assert(i == oprsz); \ 3545 } 3546 3547 DO_UZP(sve_uzp_b, uint8_t, H1) 3548 DO_UZP(sve_uzp_h, uint16_t, H1_2) 3549 DO_UZP(sve_uzp_s, uint32_t, H1_4) 3550 DO_UZP(sve_uzp_d, uint64_t, H1_8) 3551 DO_UZP(sve2_uzp_q, Int128, ) 3552 3553 typedef void perseg_zzz_fn(void *vd, void *vn, void *vm, uint32_t desc); 3554 3555 static void do_perseg_zzz(void *vd, void *vn, void *vm, 3556 uint32_t desc, perseg_zzz_fn *fn) 3557 { 3558 intptr_t oprsz = simd_oprsz(desc); 3559 3560 desc = simd_desc(16, 16, simd_data(desc)); 3561 for (intptr_t i = 0; i < oprsz; i += 16) { 3562 fn(vd + i, vn + i, vm + i, desc); 3563 } 3564 } 3565 3566 #define DO_PERSEG_ZZZ(NAME, FUNC) \ 3567 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3568 { do_perseg_zzz(vd, vn, vm, desc, FUNC); } 3569 3570 DO_PERSEG_ZZZ(sve2p1_uzpq_b, helper_sve_uzp_b) 3571 DO_PERSEG_ZZZ(sve2p1_uzpq_h, helper_sve_uzp_h) 3572 DO_PERSEG_ZZZ(sve2p1_uzpq_s, helper_sve_uzp_s) 3573 
DO_PERSEG_ZZZ(sve2p1_uzpq_d, helper_sve_uzp_d) 3574 3575 DO_PERSEG_ZZZ(sve2p1_zipq_b, helper_sve_zip_b) 3576 DO_PERSEG_ZZZ(sve2p1_zipq_h, helper_sve_zip_h) 3577 DO_PERSEG_ZZZ(sve2p1_zipq_s, helper_sve_zip_s) 3578 DO_PERSEG_ZZZ(sve2p1_zipq_d, helper_sve_zip_d) 3579 3580 DO_PERSEG_ZZZ(sve2p1_tblq_b, helper_sve_tbl_b) 3581 DO_PERSEG_ZZZ(sve2p1_tblq_h, helper_sve_tbl_h) 3582 DO_PERSEG_ZZZ(sve2p1_tblq_s, helper_sve_tbl_s) 3583 DO_PERSEG_ZZZ(sve2p1_tblq_d, helper_sve_tbl_d) 3584 3585 DO_PERSEG_ZZZ(sve2p1_tbxq_b, helper_sve2_tbx_b) 3586 DO_PERSEG_ZZZ(sve2p1_tbxq_h, helper_sve2_tbx_h) 3587 DO_PERSEG_ZZZ(sve2p1_tbxq_s, helper_sve2_tbx_s) 3588 DO_PERSEG_ZZZ(sve2p1_tbxq_d, helper_sve2_tbx_d) 3589 3590 #undef DO_PERSEG_ZZZ 3591 3592 #define DO_TRN(NAME, TYPE, H) \ 3593 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3594 { \ 3595 intptr_t oprsz = simd_oprsz(desc); \ 3596 intptr_t odd_ofs = simd_data(desc); \ 3597 intptr_t i; \ 3598 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ 3599 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ 3600 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ 3601 *(TYPE *)(vd + H(i + 0)) = ae; \ 3602 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ 3603 } \ 3604 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3605 memset(vd + oprsz - 16, 0, 16); \ 3606 } \ 3607 } 3608 3609 DO_TRN(sve_trn_b, uint8_t, H1) 3610 DO_TRN(sve_trn_h, uint16_t, H1_2) 3611 DO_TRN(sve_trn_s, uint32_t, H1_4) 3612 DO_TRN(sve_trn_d, uint64_t, H1_8) 3613 DO_TRN(sve2_trn_q, Int128, ) 3614 3615 #undef DO_ZIP 3616 #undef DO_UZP 3617 #undef DO_TRN 3618 3619 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) 3620 { 3621 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; 3622 uint32_t *d = vd, *n = vn; 3623 uint8_t *pg = vg; 3624 3625 for (i = j = 0; i < opr_sz; i++) { 3626 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { 3627 d[H4(j)] = n[H4(i)]; 3628 j++; 3629 } 3630 } 3631 for (; j < opr_sz; j++) { 3632 d[H4(j)] = 0; 3633 } 3634 } 3635 3636 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) 3637 { 3638 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; 3639 uint64_t *d = vd, *n = vn; 3640 uint8_t *pg = vg; 3641 3642 for (i = j = 0; i < opr_sz; i++) { 3643 if (pg[H1(i)] & 1) { 3644 d[j] = n[i]; 3645 j++; 3646 } 3647 } 3648 for (; j < opr_sz; j++) { 3649 d[j] = 0; 3650 } 3651 } 3652 3653 /* Similar to the ARM LastActiveElement pseudocode function, except the 3654 * result is multiplied by the element size. This includes the not found 3655 * indication; e.g. not found for esz=3 is -8. 3656 */ 3657 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) 3658 { 3659 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 3660 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3661 3662 return last_active_element(vg, words, esz); 3663 } 3664 3665 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) 3666 { 3667 intptr_t opr_sz = simd_oprsz(desc) / 8; 3668 int esz = simd_data(desc); 3669 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; 3670 intptr_t i, first_i, last_i; 3671 ARMVectorReg tmp; 3672 3673 first_i = last_i = 0; 3674 first_g = last_g = 0; 3675 3676 /* Find the extent of the active elements within VG. 
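 * Scanning backward, FIRST_G/FIRST_I finish describing the lowest
 * predicate word containing an active bit and LAST_G/LAST_I the highest;
 * the exact bit positions are recovered below with ctz64 and clz64.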
*/ 3677 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { 3678 pg = *(uint64_t *)(vg + i) & mask; 3679 if (pg) { 3680 if (last_g == 0) { 3681 last_g = pg; 3682 last_i = i; 3683 } 3684 first_g = pg; 3685 first_i = i; 3686 } 3687 } 3688 3689 len = 0; 3690 if (first_g != 0) { 3691 first_i = first_i * 8 + ctz64(first_g); 3692 last_i = last_i * 8 + 63 - clz64(last_g); 3693 len = last_i - first_i + (1 << esz); 3694 if (vd == vm) { 3695 vm = memcpy(&tmp, vm, opr_sz * 8); 3696 } 3697 swap_memmove(vd, vn + first_i, len); 3698 } 3699 swap_memmove(vd + len, vm, opr_sz * 8 - len); 3700 } 3701 3702 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, 3703 void *vg, uint32_t desc) 3704 { 3705 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3706 uint64_t *d = vd, *n = vn, *m = vm; 3707 uint8_t *pg = vg; 3708 3709 for (i = 0; i < opr_sz; i += 1) { 3710 uint64_t nn = n[i], mm = m[i]; 3711 uint64_t pp = expand_pred_b(pg[H1(i)]); 3712 d[i] = (nn & pp) | (mm & ~pp); 3713 } 3714 } 3715 3716 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, 3717 void *vg, uint32_t desc) 3718 { 3719 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3720 uint64_t *d = vd, *n = vn, *m = vm; 3721 uint8_t *pg = vg; 3722 3723 for (i = 0; i < opr_sz; i += 1) { 3724 uint64_t nn = n[i], mm = m[i]; 3725 uint64_t pp = expand_pred_h(pg[H1(i)]); 3726 d[i] = (nn & pp) | (mm & ~pp); 3727 } 3728 } 3729 3730 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, 3731 void *vg, uint32_t desc) 3732 { 3733 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3734 uint64_t *d = vd, *n = vn, *m = vm; 3735 uint8_t *pg = vg; 3736 3737 for (i = 0; i < opr_sz; i += 1) { 3738 uint64_t nn = n[i], mm = m[i]; 3739 uint64_t pp = expand_pred_s(pg[H1(i)]); 3740 d[i] = (nn & pp) | (mm & ~pp); 3741 } 3742 } 3743 3744 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, 3745 void *vg, uint32_t desc) 3746 { 3747 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3748 uint64_t *d = vd, *n = vn, *m = vm; 3749 uint8_t *pg = vg; 3750 3751 for (i = 0; i < opr_sz; i += 1) { 3752 uint64_t nn = n[i], mm = m[i]; 3753 d[i] = (pg[H1(i)] & 1 ? nn : mm); 3754 } 3755 } 3756 3757 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, 3758 void *vg, uint32_t desc) 3759 { 3760 intptr_t i, opr_sz = simd_oprsz(desc) / 16; 3761 Int128 *d = vd, *n = vn, *m = vm; 3762 uint16_t *pg = vg; 3763 3764 for (i = 0; i < opr_sz; i += 1) { 3765 d[i] = (pg[H2(i)] & 1 ? n : m)[i]; 3766 } 3767 } 3768 3769 /* Two operand comparison controlled by a predicate. 3770 * ??? It is very tempting to want to be able to expand this inline 3771 * with x86 instructions, e.g. 3772 * 3773 * vcmpeqw zm, zn, %ymm0 3774 * vpmovmskb %ymm0, %eax 3775 * and $0x5555, %eax 3776 * and pg, %eax 3777 * 3778 * or even aarch64, e.g. 3779 * 3780 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 3781 * cmeq v0.8h, zn, zm 3782 * and v0.8h, v0.8h, mask 3783 * addv h0, v0.8h 3784 * and v0.8b, pg 3785 * 3786 * However, coming up with an abstraction that allows vector inputs and 3787 * a scalar output, and also handles the byte-ordering of sub-uint64_t 3788 * scalar outputs, is tricky. 
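 * The expander below therefore stays scalar.  It walks the vector from
 * the top down in 64-byte chunks, accumulating one result bit per byte of
 * element at the element's least significant byte offset, which matches
 * the SVE predicate layout: e.g. for .S elements the 16 results of a
 * chunk land in bits 0, 4, 8, ... 60, exactly the bits selected by MASK.
 * The chunk is then ANDed with the governing predicate, stored as one
 * 64-bit predicate word, and folded into the NZCV flags with
 * iter_predtest_bwd, since the iteration runs backward.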
3789 */ 3790 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ 3791 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3792 { \ 3793 intptr_t opr_sz = simd_oprsz(desc); \ 3794 uint32_t flags = PREDTEST_INIT; \ 3795 intptr_t i = opr_sz; \ 3796 do { \ 3797 uint64_t out = 0, pg; \ 3798 do { \ 3799 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3800 TYPE nn = *(TYPE *)(vn + H(i)); \ 3801 TYPE mm = *(TYPE *)(vm + H(i)); \ 3802 out |= nn OP mm; \ 3803 } while (i & 63); \ 3804 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3805 out &= pg; \ 3806 *(uint64_t *)(vd + (i >> 3)) = out; \ 3807 flags = iter_predtest_bwd(out, pg, flags); \ 3808 } while (i > 0); \ 3809 return flags; \ 3810 } 3811 3812 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ 3813 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3814 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ 3815 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3816 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ 3817 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3818 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ 3819 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3820 3821 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) 3822 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) 3823 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) 3824 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) 3825 3826 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) 3827 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) 3828 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) 3829 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) 3830 3831 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) 3832 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) 3833 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) 3834 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) 3835 3836 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) 3837 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) 3838 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) 3839 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) 3840 3841 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) 3842 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) 3843 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) 3844 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) 3845 3846 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) 3847 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) 3848 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) 3849 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) 3850 3851 #undef DO_CMP_PPZZ_B 3852 #undef DO_CMP_PPZZ_H 3853 #undef DO_CMP_PPZZ_S 3854 #undef DO_CMP_PPZZ_D 3855 #undef DO_CMP_PPZZ 3856 3857 /* Similar, but the second source is "wide". 
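 * Each 64-bit element of ZM is compared against every narrower element of
 * ZN that shares the same 64-bit slice, hence the extra middle loop that
 * steps ZM by 8 bytes at a time.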
*/ 3858 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ 3859 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3860 { \ 3861 intptr_t opr_sz = simd_oprsz(desc); \ 3862 uint32_t flags = PREDTEST_INIT; \ 3863 intptr_t i = opr_sz; \ 3864 do { \ 3865 uint64_t out = 0, pg; \ 3866 do { \ 3867 TYPEW mm = *(TYPEW *)(vm + i - 8); \ 3868 do { \ 3869 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3870 TYPE nn = *(TYPE *)(vn + H(i)); \ 3871 out |= nn OP mm; \ 3872 } while (i & 7); \ 3873 } while (i & 63); \ 3874 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3875 out &= pg; \ 3876 *(uint64_t *)(vd + (i >> 3)) = out; \ 3877 flags = iter_predtest_bwd(out, pg, flags); \ 3878 } while (i > 0); \ 3879 return flags; \ 3880 } 3881 3882 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ 3883 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) 3884 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ 3885 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) 3886 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ 3887 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) 3888 3889 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) 3890 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) 3891 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) 3892 3893 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) 3894 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) 3895 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) 3896 3897 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) 3898 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) 3899 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) 3900 3901 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) 3902 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) 3903 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) 3904 3905 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) 3906 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) 3907 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) 3908 3909 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) 3910 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) 3911 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) 3912 3913 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) 3914 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) 3915 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) 3916 3917 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) 3918 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) 3919 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) 3920 3921 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) 3922 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) 3923 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) 3924 3925 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) 3926 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) 3927 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) 3928 3929 #undef DO_CMP_PPZW_B 3930 #undef DO_CMP_PPZW_H 3931 #undef DO_CMP_PPZW_S 3932 #undef DO_CMP_PPZW 3933 3934 /* Similar, but the second source is immediate. 
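 * The comparison value arrives in simd_data(desc) and is converted to
 * TYPE; the iteration structure is otherwise identical to DO_CMP_PPZZ.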
*/ 3935 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ 3936 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 3937 { \ 3938 intptr_t opr_sz = simd_oprsz(desc); \ 3939 uint32_t flags = PREDTEST_INIT; \ 3940 TYPE mm = simd_data(desc); \ 3941 intptr_t i = opr_sz; \ 3942 do { \ 3943 uint64_t out = 0, pg; \ 3944 do { \ 3945 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3946 TYPE nn = *(TYPE *)(vn + H(i)); \ 3947 out |= nn OP mm; \ 3948 } while (i & 63); \ 3949 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3950 out &= pg; \ 3951 *(uint64_t *)(vd + (i >> 3)) = out; \ 3952 flags = iter_predtest_bwd(out, pg, flags); \ 3953 } while (i > 0); \ 3954 return flags; \ 3955 } 3956 3957 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \ 3958 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3959 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \ 3960 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3961 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \ 3962 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3963 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \ 3964 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3965 3966 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) 3967 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) 3968 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) 3969 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) 3970 3971 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) 3972 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) 3973 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) 3974 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) 3975 3976 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) 3977 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) 3978 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) 3979 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) 3980 3981 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) 3982 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) 3983 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) 3984 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) 3985 3986 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) 3987 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) 3988 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) 3989 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) 3990 3991 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) 3992 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) 3993 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) 3994 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) 3995 3996 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) 3997 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) 3998 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) 3999 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) 4000 4001 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) 4002 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) 4003 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) 4004 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) 4005 4006 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) 4007 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) 4008 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) 4009 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) 4010 4011 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) 4012 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) 4013 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) 4014 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) 4015 4016 #undef DO_CMP_PPZI_B 4017 #undef DO_CMP_PPZI_H 4018 #undef DO_CMP_PPZI_S 4019 #undef DO_CMP_PPZI_D 4020 #undef DO_CMP_PPZI 4021 4022 /* Similar to the ARM LastActive pseudocode function. 
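 * Returns true if the bit of VD selected by the highest set bit of VG is
 * itself set; e.g. for VG == 0x55 and VD == 0x40 the highest guard bit is
 * bit 6, which is set in VD, so the result is true.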
*/ 4023 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) 4024 { 4025 intptr_t i; 4026 4027 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { 4028 uint64_t pg = *(uint64_t *)(vg + i); 4029 if (pg) { 4030 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; 4031 } 4032 } 4033 return 0; 4034 } 4035 4036 /* Compute a mask into RETB that is true for all G, up to and including 4037 * (if after) or excluding (if !after) the first G & N. 4038 * Return true if BRK found. 4039 */ 4040 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, 4041 bool brk, bool after) 4042 { 4043 uint64_t b; 4044 4045 if (brk) { 4046 b = 0; 4047 } else if ((g & n) == 0) { 4048 /* For all G, no N are set; break not found. */ 4049 b = g; 4050 } else { 4051 /* Break somewhere in N. Locate it. */ 4052 b = g & n; /* guard true, pred true */ 4053 b = b & -b; /* first such */ 4054 if (after) { 4055 b = b | (b - 1); /* break after same */ 4056 } else { 4057 b = b - 1; /* break before same */ 4058 } 4059 brk = true; 4060 } 4061 4062 *retb = b; 4063 return brk; 4064 } 4065 4066 /* Compute a zeroing BRK. */ 4067 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, 4068 intptr_t oprsz, bool after) 4069 { 4070 bool brk = false; 4071 intptr_t i; 4072 4073 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 4074 uint64_t this_b, this_g = g[i]; 4075 4076 brk = compute_brk(&this_b, n[i], this_g, brk, after); 4077 d[i] = this_b & this_g; 4078 } 4079 } 4080 4081 /* Likewise, but also compute flags. */ 4082 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, 4083 intptr_t oprsz, bool after) 4084 { 4085 uint32_t flags = PREDTEST_INIT; 4086 bool brk = false; 4087 intptr_t i; 4088 4089 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 4090 uint64_t this_b, this_d, this_g = g[i]; 4091 4092 brk = compute_brk(&this_b, n[i], this_g, brk, after); 4093 d[i] = this_d = this_b & this_g; 4094 flags = iter_predtest_fwd(this_d, this_g, flags); 4095 } 4096 return flags; 4097 } 4098 4099 /* Compute a merging BRK. */ 4100 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, 4101 intptr_t oprsz, bool after) 4102 { 4103 bool brk = false; 4104 intptr_t i; 4105 4106 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 4107 uint64_t this_b, this_g = g[i]; 4108 4109 brk = compute_brk(&this_b, n[i], this_g, brk, after); 4110 d[i] = (this_b & this_g) | (d[i] & ~this_g); 4111 } 4112 } 4113 4114 /* Likewise, but also compute flags. 
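 * The flags are those of a PTEST of the merged result against the
 * governing predicate, accumulated word by word with iter_predtest_fwd.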
*/ 4115 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, 4116 intptr_t oprsz, bool after) 4117 { 4118 uint32_t flags = PREDTEST_INIT; 4119 bool brk = false; 4120 intptr_t i; 4121 4122 for (i = 0; i < oprsz / 8; ++i) { 4123 uint64_t this_b, this_d = d[i], this_g = g[i]; 4124 4125 brk = compute_brk(&this_b, n[i], this_g, brk, after); 4126 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); 4127 flags = iter_predtest_fwd(this_d, this_g, flags); 4128 } 4129 return flags; 4130 } 4131 4132 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, 4133 uint32_t pred_desc) 4134 { 4135 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4136 if (last_active_pred(vn, vg, oprsz)) { 4137 compute_brk_z(vd, vm, vg, oprsz, true); 4138 } else { 4139 memset(vd, 0, sizeof(ARMPredicateReg)); 4140 } 4141 } 4142 4143 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, 4144 uint32_t pred_desc) 4145 { 4146 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4147 if (last_active_pred(vn, vg, oprsz)) { 4148 return compute_brks_z(vd, vm, vg, oprsz, true); 4149 } else { 4150 memset(vd, 0, sizeof(ARMPredicateReg)); 4151 return PREDTEST_INIT; 4152 } 4153 } 4154 4155 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, 4156 uint32_t pred_desc) 4157 { 4158 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4159 if (last_active_pred(vn, vg, oprsz)) { 4160 compute_brk_z(vd, vm, vg, oprsz, false); 4161 } else { 4162 memset(vd, 0, sizeof(ARMPredicateReg)); 4163 } 4164 } 4165 4166 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, 4167 uint32_t pred_desc) 4168 { 4169 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4170 if (last_active_pred(vn, vg, oprsz)) { 4171 return compute_brks_z(vd, vm, vg, oprsz, false); 4172 } else { 4173 memset(vd, 0, sizeof(ARMPredicateReg)); 4174 return PREDTEST_INIT; 4175 } 4176 } 4177 4178 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4179 { 4180 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4181 compute_brk_z(vd, vn, vg, oprsz, true); 4182 } 4183 4184 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4185 { 4186 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4187 return compute_brks_z(vd, vn, vg, oprsz, true); 4188 } 4189 4190 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4191 { 4192 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4193 compute_brk_z(vd, vn, vg, oprsz, false); 4194 } 4195 4196 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4197 { 4198 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4199 return compute_brks_z(vd, vn, vg, oprsz, false); 4200 } 4201 4202 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4203 { 4204 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4205 compute_brk_m(vd, vn, vg, oprsz, true); 4206 } 4207 4208 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4209 { 4210 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4211 return compute_brks_m(vd, vn, vg, oprsz, true); 4212 } 4213 4214 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4215 { 4216 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4217 compute_brk_m(vd, vn, vg, oprsz, false); 4218 } 4219 4220 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4221 { 4222 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4223 return 
compute_brks_m(vd, vn, vg, oprsz, false); 4224 } 4225 4226 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4227 { 4228 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4229 if (!last_active_pred(vn, vg, oprsz)) { 4230 memset(vd, 0, sizeof(ARMPredicateReg)); 4231 } 4232 } 4233 4234 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4235 { 4236 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4237 if (last_active_pred(vn, vg, oprsz)) { 4238 ARMPredicateReg *d = vd; 4239 uint32_t flags = PREDTEST_INIT; 4240 intptr_t i; 4241 4242 /* As if PredTest(Ones(PL), D, MO_8). */ 4243 for (i = 0; i < oprsz / 8; i++) { 4244 flags = iter_predtest_fwd(d->p[i], -1, flags); 4245 } 4246 if (oprsz & 7) { 4247 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); 4248 flags = iter_predtest_fwd(d->p[i], mask, flags); 4249 } 4250 return flags; 4251 } 4252 memset(vd, 0, sizeof(ARMPredicateReg)); 4253 return PREDTEST_INIT; 4254 } 4255 4256 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) 4257 { 4258 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 4259 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4260 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; 4261 intptr_t i; 4262 4263 for (i = 0; i < words; ++i) { 4264 uint64_t t = n[i] & g[i] & mask; 4265 sum += ctpop64(t); 4266 } 4267 return sum; 4268 } 4269 4270 uint64_t HELPER(sve2p1_cntp_c)(uint32_t png, uint32_t desc) 4271 { 4272 int pl = FIELD_EX32(desc, PREDDESC, OPRSZ); 4273 int vl = pl * 8; 4274 unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ); 4275 int lg2_width = FIELD_EX32(desc, PREDDESC, DATA) + 1; 4276 DecodeCounter p = decode_counter(png, vl, v_esz); 4277 unsigned maxelem = (vl << lg2_width) >> v_esz; 4278 unsigned count = p.count; 4279 4280 if (p.invert) { 4281 if (count >= maxelem) { 4282 return 0; 4283 } 4284 count = maxelem - count; 4285 } else { 4286 count = MIN(count, maxelem); 4287 } 4288 return count >> p.lg2_stride; 4289 } 4290 4291 /* C.f. Arm pseudocode EncodePredCount */ 4292 static uint64_t encode_pred_count(uint32_t elements, uint32_t count, 4293 uint32_t esz, bool invert) 4294 { 4295 uint32_t pred; 4296 4297 if (count == 0) { 4298 return 0; 4299 } 4300 if (invert) { 4301 count = elements - count; 4302 } else if (count == elements) { 4303 count = 0; 4304 invert = true; 4305 } 4306 4307 pred = (count << 1) | 1; 4308 pred <<= esz; 4309 pred |= invert << 15; 4310 4311 return pred; 4312 } 4313 4314 /* C.f. Arm pseudocode PredCountTest */ 4315 static uint32_t pred_count_test(uint32_t elements, uint32_t count, bool invert) 4316 { 4317 uint32_t flags; 4318 4319 if (count == 0) { 4320 flags = 1; /* !N, Z, C */ 4321 } else if (!invert) { 4322 flags = (1u << 31) | 2; /* N, !Z */ 4323 flags |= count != elements; /* C */ 4324 } else { 4325 flags = 2; /* !Z, !C */ 4326 flags |= (count == elements) << 31; /* N */ 4327 } 4328 return flags; 4329 } 4330 4331 /* D must be cleared on entry. */ 4332 static void do_whilel(ARMPredicateReg *d, uint64_t esz_mask, 4333 uint32_t count, uint32_t oprbits) 4334 { 4335 tcg_debug_assert(count <= oprbits); 4336 if (count) { 4337 uint32_t i; 4338 4339 /* Set all of the requested bits. 
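 * COUNT has already been scaled to a bit count by the callers
 * (count << esz); ESZ_MASK then keeps one predicate bit per element.
 * For example, with esz == MO_32 and 3 active elements, COUNT arrives
 * as 12 and the result word is 0x111 (bits 0, 4 and 8 set).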
*/ 4340 for (i = 0; i < count / 64; ++i) { 4341 d->p[i] = esz_mask; 4342 } 4343 if (count & 63) { 4344 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; 4345 } 4346 } 4347 } 4348 4349 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) 4350 { 4351 uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4352 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4353 uint32_t oprbits = oprsz * 8; 4354 uint64_t esz_mask = pred_esz_masks[esz]; 4355 ARMPredicateReg *d = vd; 4356 4357 count <<= esz; 4358 memset(d, 0, sizeof(*d)); 4359 do_whilel(d, esz_mask, count, oprbits); 4360 return pred_count_test(oprbits, count, false); 4361 } 4362 4363 uint32_t HELPER(sve_while2l)(void *vd, uint32_t count, uint32_t pred_desc) 4364 { 4365 uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4366 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4367 uint32_t oprbits = oprsz * 8; 4368 uint64_t esz_mask = pred_esz_masks[esz]; 4369 ARMPredicateReg *d = vd; 4370 4371 count <<= esz; 4372 memset(d, 0, 2 * sizeof(*d)); 4373 if (count <= oprbits) { 4374 do_whilel(&d[0], esz_mask, count, oprbits); 4375 } else { 4376 do_whilel(&d[0], esz_mask, oprbits, oprbits); 4377 do_whilel(&d[1], esz_mask, count - oprbits, oprbits); 4378 } 4379 4380 return pred_count_test(2 * oprbits, count, false); 4381 } 4382 4383 uint32_t HELPER(sve_whilecl)(void *vd, uint32_t count, uint32_t pred_desc) 4384 { 4385 uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4386 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4387 uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA); 4388 uint32_t vl = pl * 8; 4389 uint32_t elements = (vl >> esz) << scale; 4390 ARMPredicateReg *d = vd; 4391 4392 *d = (ARMPredicateReg) { 4393 .p[0] = encode_pred_count(elements, count, esz, false) 4394 }; 4395 return pred_count_test(elements, count, false); 4396 } 4397 4398 /* D must be cleared on entry. 
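 * Unlike do_whilel above, this sets the COUNT most significant bits of
 * the predicate; e.g. with oprbits == 32, esz == MO_8 and COUNT == 3,
 * bits 29..31 are set.  The callers pair this with
 * pred_count_test(..., true).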
*/ 4399 static void do_whileg(ARMPredicateReg *d, uint64_t esz_mask, 4400 uint32_t count, uint32_t oprbits) 4401 { 4402 tcg_debug_assert(count <= oprbits); 4403 if (count) { 4404 uint32_t i, invcount = oprbits - count; 4405 uint64_t bits = esz_mask & MAKE_64BIT_MASK(invcount & 63, 64); 4406 4407 for (i = invcount / 64; i < oprbits / 64; ++i) { 4408 d->p[i] = bits; 4409 bits = esz_mask; 4410 } 4411 if (oprbits & 63) { 4412 d->p[i] = bits & MAKE_64BIT_MASK(0, oprbits & 63); 4413 } 4414 } 4415 } 4416 4417 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) 4418 { 4419 uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4420 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4421 uint32_t oprbits = oprsz * 8; 4422 uint64_t esz_mask = pred_esz_masks[esz]; 4423 ARMPredicateReg *d = vd; 4424 4425 count <<= esz; 4426 memset(d, 0, sizeof(*d)); 4427 do_whileg(d, esz_mask, count, oprbits); 4428 return pred_count_test(oprbits, count, true); 4429 } 4430 4431 uint32_t HELPER(sve_while2g)(void *vd, uint32_t count, uint32_t pred_desc) 4432 { 4433 uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4434 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4435 uint32_t oprbits = oprsz * 8; 4436 uint64_t esz_mask = pred_esz_masks[esz]; 4437 ARMPredicateReg *d = vd; 4438 4439 count <<= esz; 4440 memset(d, 0, 2 * sizeof(*d)); 4441 if (count <= oprbits) { 4442 do_whileg(&d[1], esz_mask, count, oprbits); 4443 } else { 4444 do_whilel(&d[1], esz_mask, oprbits, oprbits); 4445 do_whileg(&d[0], esz_mask, count - oprbits, oprbits); 4446 } 4447 4448 return pred_count_test(2 * oprbits, count, true); 4449 } 4450 4451 uint32_t HELPER(sve_whilecg)(void *vd, uint32_t count, uint32_t pred_desc) 4452 { 4453 uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4454 uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4455 uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA); 4456 uint32_t vl = pl * 8; 4457 uint32_t elements = (vl >> esz) << scale; 4458 ARMPredicateReg *d = vd; 4459 4460 *d = (ARMPredicateReg) { 4461 .p[0] = encode_pred_count(elements, count, esz, true) 4462 }; 4463 return pred_count_test(elements, count, true); 4464 } 4465 4466 /* Recursive reduction on a function; 4467 * C.f. the ARM ARM function ReducePredicated. 4468 * 4469 * While it would be possible to write this without the DATA temporary, 4470 * it is much simpler to process the predicate register this way. 4471 * The recursion is bounded to depth 7 (128 fp16 elements), so there's 4472 * little to gain with a more complex non-recursive form. 4473 */ 4474 #define DO_REDUCE(NAME, SUF, TYPE, H, FUNC, IDENT) \ 4475 static TYPE FUNC##_reduce(TYPE *data, float_status *status, uintptr_t n) \ 4476 { \ 4477 if (n == 1) { \ 4478 return *data; \ 4479 } else { \ 4480 uintptr_t half = n / 2; \ 4481 TYPE lo = FUNC##_reduce(data, status, half); \ 4482 TYPE hi = FUNC##_reduce(data + half, status, half); \ 4483 return FUNC(lo, hi, status); \ 4484 } \ 4485 } \ 4486 uint64_t helper_sve_##NAME##v_##SUF(void *vn, void *vg, \ 4487 float_status *s, uint32_t desc) \ 4488 { \ 4489 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ 4490 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ 4491 for (i = 0; i < oprsz; ) { \ 4492 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 4493 do { \ 4494 TYPE nn = *(TYPE *)(vn + H(i)); \ 4495 *(TYPE *)((void *)data + i) = (pg & 1 ? 
nn : IDENT); \ 4496 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 4497 } while (i & 15); \ 4498 } \ 4499 for (; i < maxsz; i += sizeof(TYPE)) { \ 4500 *(TYPE *)((void *)data + i) = IDENT; \ 4501 } \ 4502 return FUNC##_reduce(data, s, maxsz / sizeof(TYPE)); \ 4503 } \ 4504 void helper_sve2p1_##NAME##qv_##SUF(void *vd, void *vn, void *vg, \ 4505 float_status *status, uint32_t desc) \ 4506 { \ 4507 unsigned oprsz = simd_oprsz(desc), segments = oprsz / 16; \ 4508 for (unsigned e = 0; e < 16; e += sizeof(TYPE)) { \ 4509 TYPE data[ARM_MAX_VQ]; \ 4510 for (unsigned s = 0; s < segments; s++) { \ 4511 uint16_t pg = *(uint16_t *)(vg + H1_2(s * 2)); \ 4512 TYPE nn = *(TYPE *)(vn + H(s * 16 + H(e))); \ 4513 data[s] = (pg >> e) & 1 ? nn : IDENT; \ 4514 } \ 4515 *(TYPE *)(vd + H(e)) = FUNC##_reduce(data, status, segments); \ 4516 } \ 4517 clear_tail(vd, 16, simd_maxsz(desc)); \ 4518 } 4519 4520 DO_REDUCE(fadd,h, float16, H1_2, float16_add, float16_zero) 4521 DO_REDUCE(fadd,s, float32, H1_4, float32_add, float32_zero) 4522 DO_REDUCE(fadd,d, float64, H1_8, float64_add, float64_zero) 4523 4524 /* Identity is floatN_default_nan, without the function call. */ 4525 DO_REDUCE(fminnm,h, float16, H1_2, float16_minnum, 0x7E00) 4526 DO_REDUCE(fminnm,s, float32, H1_4, float32_minnum, 0x7FC00000) 4527 DO_REDUCE(fminnm,d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL) 4528 4529 DO_REDUCE(fmaxnm,h, float16, H1_2, float16_maxnum, 0x7E00) 4530 DO_REDUCE(fmaxnm,s, float32, H1_4, float32_maxnum, 0x7FC00000) 4531 DO_REDUCE(fmaxnm,d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL) 4532 4533 DO_REDUCE(fmin,h, float16, H1_2, float16_min, float16_infinity) 4534 DO_REDUCE(fmin,s, float32, H1_4, float32_min, float32_infinity) 4535 DO_REDUCE(fmin,d, float64, H1_8, float64_min, float64_infinity) 4536 4537 DO_REDUCE(fmax,h, float16, H1_2, float16_max, float16_chs(float16_infinity)) 4538 DO_REDUCE(fmax,s, float32, H1_4, float32_max, float32_chs(float32_infinity)) 4539 DO_REDUCE(fmax,d, float64, H1_8, float64_max, float64_chs(float64_infinity)) 4540 4541 DO_REDUCE(ah_fmin,h, float16, H1_2, helper_vfp_ah_minh, float16_infinity) 4542 DO_REDUCE(ah_fmin,s, float32, H1_4, helper_vfp_ah_mins, float32_infinity) 4543 DO_REDUCE(ah_fmin,d, float64, H1_8, helper_vfp_ah_mind, float64_infinity) 4544 4545 DO_REDUCE(ah_fmax,h, float16, H1_2, helper_vfp_ah_maxh, 4546 float16_chs(float16_infinity)) 4547 DO_REDUCE(ah_fmax,s, float32, H1_4, helper_vfp_ah_maxs, 4548 float32_chs(float32_infinity)) 4549 DO_REDUCE(ah_fmax,d, float64, H1_8, helper_vfp_ah_maxd, 4550 float64_chs(float64_infinity)) 4551 4552 #undef DO_REDUCE 4553 4554 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, 4555 float_status *status, uint32_t desc) 4556 { 4557 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4558 float16 result = nn; 4559 4560 do { 4561 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4562 do { 4563 if (pg & 1) { 4564 float16 mm = *(float16 *)(vm + H1_2(i)); 4565 result = float16_add(result, mm, status); 4566 } 4567 i += sizeof(float16), pg >>= sizeof(float16); 4568 } while (i & 15); 4569 } while (i < opr_sz); 4570 4571 return result; 4572 } 4573 4574 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, 4575 float_status *status, uint32_t desc) 4576 { 4577 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4578 float32 result = nn; 4579 4580 do { 4581 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4582 do { 4583 if (pg & 1) { 4584 float32 mm = *(float32 *)(vm + H1_2(i)); 4585 result = float32_add(result, mm, status); 4586 } 4587 i += sizeof(float32), 
pg >>= sizeof(float32); 4588 } while (i & 15); 4589 } while (i < opr_sz); 4590 4591 return result; 4592 } 4593 4594 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, 4595 float_status *status, uint32_t desc) 4596 { 4597 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; 4598 uint64_t *m = vm; 4599 uint8_t *pg = vg; 4600 4601 for (i = 0; i < opr_sz; i++) { 4602 if (pg[H1(i)] & 1) { 4603 nn = float64_add(nn, m[i], status); 4604 } 4605 } 4606 4607 return nn; 4608 } 4609 4610 /* Fully general three-operand expander, controlled by a predicate, 4611 * With the extra float_status parameter. 4612 */ 4613 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ 4614 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4615 float_status *status, uint32_t desc) \ 4616 { \ 4617 intptr_t i = simd_oprsz(desc); \ 4618 uint64_t *g = vg; \ 4619 do { \ 4620 uint64_t pg = g[(i - 1) >> 6]; \ 4621 do { \ 4622 i -= sizeof(TYPE); \ 4623 if (likely((pg >> (i & 63)) & 1)) { \ 4624 TYPE nn = *(TYPE *)(vn + H(i)); \ 4625 TYPE mm = *(TYPE *)(vm + H(i)); \ 4626 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4627 } \ 4628 } while (i & 63); \ 4629 } while (i != 0); \ 4630 } 4631 4632 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) 4633 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) 4634 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) 4635 4636 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) 4637 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) 4638 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) 4639 4640 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) 4641 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) 4642 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) 4643 4644 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) 4645 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) 4646 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) 4647 4648 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) 4649 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) 4650 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) 4651 4652 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) 4653 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) 4654 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) 4655 4656 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh) 4657 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins) 4658 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind) 4659 4660 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh) 4661 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs) 4662 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd) 4663 4664 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) 4665 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) 4666 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) 4667 4668 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) 4669 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) 4670 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) 4671 4672 static inline float16 abd_h(float16 a, float16 b, float_status *s) 4673 { 4674 return float16_abs(float16_sub(a, b, s)); 4675 } 4676 4677 static inline float32 abd_s(float32 a, float32 b, float_status *s) 4678 { 4679 return float32_abs(float32_sub(a, b, s)); 4680 } 4681 4682 static inline float64 abd_d(float64 a, float64 b, float_status *s) 4683 { 4684 return float64_abs(float64_sub(a, b, s)); 4685 } 4686 4687 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */ 4688 static float16 ah_abd_h(float16 op1, 
float16 op2, float_status *stat) 4689 { 4690 float16 r = float16_sub(op1, op2, stat); 4691 return float16_is_any_nan(r) ? r : float16_abs(r); 4692 } 4693 4694 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat) 4695 { 4696 float32 r = float32_sub(op1, op2, stat); 4697 return float32_is_any_nan(r) ? r : float32_abs(r); 4698 } 4699 4700 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat) 4701 { 4702 float64 r = float64_sub(op1, op2, stat); 4703 return float64_is_any_nan(r) ? r : float64_abs(r); 4704 } 4705 4706 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) 4707 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) 4708 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) 4709 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h) 4710 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s) 4711 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d) 4712 4713 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) 4714 { 4715 int b_int = MIN(MAX(b, INT_MIN), INT_MAX); 4716 return float64_scalbn(a, b_int, s); 4717 } 4718 4719 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) 4720 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) 4721 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) 4722 4723 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) 4724 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) 4725 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) 4726 4727 #undef DO_ZPZZ_FP 4728 4729 /* Three-operand expander, with one scalar operand, controlled by 4730 * a predicate, with the extra float_status parameter. 4731 */ 4732 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \ 4733 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ 4734 float_status *status, uint32_t desc) \ 4735 { \ 4736 intptr_t i = simd_oprsz(desc); \ 4737 uint64_t *g = vg; \ 4738 TYPE mm = scalar; \ 4739 do { \ 4740 uint64_t pg = g[(i - 1) >> 6]; \ 4741 do { \ 4742 i -= sizeof(TYPE); \ 4743 if (likely((pg >> (i & 63)) & 1)) { \ 4744 TYPE nn = *(TYPE *)(vn + H(i)); \ 4745 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4746 } \ 4747 } while (i & 63); \ 4748 } while (i != 0); \ 4749 } 4750 4751 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) 4752 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) 4753 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) 4754 4755 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) 4756 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) 4757 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) 4758 4759 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) 4760 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) 4761 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) 4762 4763 static inline float16 subr_h(float16 a, float16 b, float_status *s) 4764 { 4765 return float16_sub(b, a, s); 4766 } 4767 4768 static inline float32 subr_s(float32 a, float32 b, float_status *s) 4769 { 4770 return float32_sub(b, a, s); 4771 } 4772 4773 static inline float64 subr_d(float64 a, float64 b, float_status *s) 4774 { 4775 return float64_sub(b, a, s); 4776 } 4777 4778 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) 4779 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) 4780 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) 4781 4782 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) 4783 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) 4784 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) 4785 4786 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) 4787 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) 4788 
DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) 4789 4790 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) 4791 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) 4792 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) 4793 4794 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) 4795 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) 4796 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) 4797 4798 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh) 4799 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs) 4800 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd) 4801 4802 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh) 4803 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins) 4804 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind) 4805 4806 /* Fully general two-operand expander, controlled by a predicate, 4807 * With the extra float_status parameter. 4808 */ 4809 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \ 4810 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 4811 float_status *status, uint32_t desc) \ 4812 { \ 4813 intptr_t i = simd_oprsz(desc); \ 4814 uint64_t *g = vg; \ 4815 do { \ 4816 uint64_t pg = g[(i - 1) >> 6]; \ 4817 do { \ 4818 i -= sizeof(TYPE); \ 4819 if (likely((pg >> (i & 63)) & 1)) { \ 4820 TYPE nn = *(TYPE *)(vn + H(i)); \ 4821 *(TYPE *)(vd + H(i)) = OP(nn, status); \ 4822 } \ 4823 } while (i & 63); \ 4824 } while (i != 0); \ 4825 } 4826 4827 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore 4828 * FZ16. When converting from fp16, this affects flushing input denormals; 4829 * when converting to fp16, this affects flushing output denormals. 4830 */ 4831 float32 sve_f16_to_f32(float16 f, float_status *fpst) 4832 { 4833 bool save = get_flush_inputs_to_zero(fpst); 4834 float32 ret; 4835 4836 set_flush_inputs_to_zero(false, fpst); 4837 ret = float16_to_float32(f, true, fpst); 4838 set_flush_inputs_to_zero(save, fpst); 4839 return ret; 4840 } 4841 4842 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) 4843 { 4844 bool save = get_flush_inputs_to_zero(fpst); 4845 float64 ret; 4846 4847 set_flush_inputs_to_zero(false, fpst); 4848 ret = float16_to_float64(f, true, fpst); 4849 set_flush_inputs_to_zero(save, fpst); 4850 return ret; 4851 } 4852 4853 float16 sve_f32_to_f16(float32 f, float_status *fpst) 4854 { 4855 bool save = get_flush_to_zero(fpst); 4856 float16 ret; 4857 4858 set_flush_to_zero(false, fpst); 4859 ret = float32_to_float16(f, true, fpst); 4860 set_flush_to_zero(save, fpst); 4861 return ret; 4862 } 4863 4864 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) 4865 { 4866 bool save = get_flush_to_zero(fpst); 4867 float16 ret; 4868 4869 set_flush_to_zero(false, fpst); 4870 ret = float64_to_float16(f, true, fpst); 4871 set_flush_to_zero(save, fpst); 4872 return ret; 4873 } 4874 4875 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) 4876 { 4877 if (float16_is_any_nan(f)) { 4878 float_raise(float_flag_invalid, s); 4879 return 0; 4880 } 4881 return float16_to_int16_round_to_zero(f, s); 4882 } 4883 4884 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) 4885 { 4886 if (float16_is_any_nan(f)) { 4887 float_raise(float_flag_invalid, s); 4888 return 0; 4889 } 4890 return float16_to_int64_round_to_zero(f, s); 4891 } 4892 4893 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) 4894 { 4895 if (float32_is_any_nan(f)) { 4896 float_raise(float_flag_invalid, s); 4897 return 0; 4898 } 
4899 return float32_to_int64_round_to_zero(f, s); 4900 } 4901 4902 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) 4903 { 4904 if (float64_is_any_nan(f)) { 4905 float_raise(float_flag_invalid, s); 4906 return 0; 4907 } 4908 return float64_to_int64_round_to_zero(f, s); 4909 } 4910 4911 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) 4912 { 4913 if (float16_is_any_nan(f)) { 4914 float_raise(float_flag_invalid, s); 4915 return 0; 4916 } 4917 return float16_to_uint16_round_to_zero(f, s); 4918 } 4919 4920 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) 4921 { 4922 if (float16_is_any_nan(f)) { 4923 float_raise(float_flag_invalid, s); 4924 return 0; 4925 } 4926 return float16_to_uint64_round_to_zero(f, s); 4927 } 4928 4929 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) 4930 { 4931 if (float32_is_any_nan(f)) { 4932 float_raise(float_flag_invalid, s); 4933 return 0; 4934 } 4935 return float32_to_uint64_round_to_zero(f, s); 4936 } 4937 4938 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) 4939 { 4940 if (float64_is_any_nan(f)) { 4941 float_raise(float_flag_invalid, s); 4942 return 0; 4943 } 4944 return float64_to_uint64_round_to_zero(f, s); 4945 } 4946 4947 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) 4948 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) 4949 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) 4950 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16) 4951 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) 4952 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32) 4953 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) 4954 4955 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) 4956 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) 4957 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) 4958 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) 4959 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 4960 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) 4961 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) 4962 4963 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) 4964 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) 4965 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) 4966 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) 4967 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) 4968 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) 4969 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) 4970 4971 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) 4972 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) 4973 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) 4974 4975 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) 4976 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) 4977 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) 4978 4979 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) 4980 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) 4981 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) 4982 4983 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) 4984 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) 4985 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) 4986 4987 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) 
4988 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) 4989 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) 4990 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) 4991 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) 4992 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) 4993 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) 4994 4995 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) 4996 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) 4997 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) 4998 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) 4999 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) 5000 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) 5001 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) 5002 5003 static int16_t do_float16_logb_as_int(float16 a, float_status *s) 5004 { 5005 /* Extract frac to the top of the uint32_t. */ 5006 uint32_t frac = (uint32_t)a << (16 + 6); 5007 int16_t exp = extract32(a, 10, 5); 5008 5009 if (unlikely(exp == 0)) { 5010 if (frac != 0) { 5011 if (!get_flush_inputs_to_zero(s)) { 5012 /* denormal: bias - fractional_zeros */ 5013 return -15 - clz32(frac); 5014 } 5015 /* flush to zero */ 5016 float_raise(float_flag_input_denormal_flushed, s); 5017 } 5018 } else if (unlikely(exp == 0x1f)) { 5019 if (frac == 0) { 5020 return INT16_MAX; /* infinity */ 5021 } 5022 } else { 5023 /* normal: exp - bias */ 5024 return exp - 15; 5025 } 5026 /* nan or zero */ 5027 float_raise(float_flag_invalid, s); 5028 return INT16_MIN; 5029 } 5030 5031 static int32_t do_float32_logb_as_int(float32 a, float_status *s) 5032 { 5033 /* Extract frac to the top of the uint32_t. */ 5034 uint32_t frac = a << 9; 5035 int32_t exp = extract32(a, 23, 8); 5036 5037 if (unlikely(exp == 0)) { 5038 if (frac != 0) { 5039 if (!get_flush_inputs_to_zero(s)) { 5040 /* denormal: bias - fractional_zeros */ 5041 return -127 - clz32(frac); 5042 } 5043 /* flush to zero */ 5044 float_raise(float_flag_input_denormal_flushed, s); 5045 } 5046 } else if (unlikely(exp == 0xff)) { 5047 if (frac == 0) { 5048 return INT32_MAX; /* infinity */ 5049 } 5050 } else { 5051 /* normal: exp - bias */ 5052 return exp - 127; 5053 } 5054 /* nan or zero */ 5055 float_raise(float_flag_invalid, s); 5056 return INT32_MIN; 5057 } 5058 5059 static int64_t do_float64_logb_as_int(float64 a, float_status *s) 5060 { 5061 /* Extract frac to the top of the uint64_t. 
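 * The shift by 12 discards the sign bit and the 11 exponent bits.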
*/ 5062 uint64_t frac = a << 12; 5063 int64_t exp = extract64(a, 52, 11); 5064 5065 if (unlikely(exp == 0)) { 5066 if (frac != 0) { 5067 if (!get_flush_inputs_to_zero(s)) { 5068 /* denormal: bias - fractional_zeros */ 5069 return -1023 - clz64(frac); 5070 } 5071 /* flush to zero */ 5072 float_raise(float_flag_input_denormal_flushed, s); 5073 } 5074 } else if (unlikely(exp == 0x7ff)) { 5075 if (frac == 0) { 5076 return INT64_MAX; /* infinity */ 5077 } 5078 } else { 5079 /* normal: exp - bias */ 5080 return exp - 1023; 5081 } 5082 /* nan or zero */ 5083 float_raise(float_flag_invalid, s); 5084 return INT64_MIN; 5085 } 5086 5087 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) 5088 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) 5089 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) 5090 5091 #undef DO_ZPZ_FP 5092 5093 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, 5094 float_status *status, uint32_t desc, 5095 uint16_t neg1, uint16_t neg3, int flags) 5096 { 5097 intptr_t i = simd_oprsz(desc); 5098 uint64_t *g = vg; 5099 5100 do { 5101 uint64_t pg = g[(i - 1) >> 6]; 5102 do { 5103 i -= 2; 5104 if (likely((pg >> (i & 63)) & 1)) { 5105 float16 e1, e2, e3, r; 5106 5107 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; 5108 e2 = *(uint16_t *)(vm + H1_2(i)); 5109 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; 5110 r = float16_muladd(e1, e2, e3, flags, status); 5111 *(uint16_t *)(vd + H1_2(i)) = r; 5112 } 5113 } while (i & 63); 5114 } while (i != 0); 5115 } 5116 5117 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5118 void *vg, float_status *status, uint32_t desc) 5119 { 5120 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 5121 } 5122 5123 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5124 void *vg, float_status *status, uint32_t desc) 5125 { 5126 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0); 5127 } 5128 5129 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5130 void *vg, float_status *status, uint32_t desc) 5131 { 5132 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0); 5133 } 5134 5135 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5136 void *vg, float_status *status, uint32_t desc) 5137 { 5138 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0); 5139 } 5140 5141 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5142 void *vg, float_status *status, uint32_t desc) 5143 { 5144 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 5145 float_muladd_negate_product); 5146 } 5147 5148 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5149 void *vg, float_status *status, uint32_t desc) 5150 { 5151 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 5152 float_muladd_negate_product | float_muladd_negate_c); 5153 } 5154 5155 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5156 void *vg, float_status *status, uint32_t desc) 5157 { 5158 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 5159 float_muladd_negate_c); 5160 } 5161 5162 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, 5163 float_status *status, uint32_t desc, 5164 uint32_t neg1, uint32_t neg3, int flags) 5165 { 5166 intptr_t i = simd_oprsz(desc); 5167 uint64_t *g = vg; 5168 5169 do { 5170 uint64_t pg = g[(i - 1) >> 6]; 5171 do { 5172 i -= 4; 5173 if (likely((pg >> (i & 63)) & 1)) { 5174 float32 e1, e2, e3, r; 5175 5176 e1 = *(uint32_t *)(vn 
+ H1_4(i)) ^ neg1; 5177 e2 = *(uint32_t *)(vm + H1_4(i)); 5178 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; 5179 r = float32_muladd(e1, e2, e3, flags, status); 5180 *(uint32_t *)(vd + H1_4(i)) = r; 5181 } 5182 } while (i & 63); 5183 } while (i != 0); 5184 } 5185 5186 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5187 void *vg, float_status *status, uint32_t desc) 5188 { 5189 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 5190 } 5191 5192 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5193 void *vg, float_status *status, uint32_t desc) 5194 { 5195 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0); 5196 } 5197 5198 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5199 void *vg, float_status *status, uint32_t desc) 5200 { 5201 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0); 5202 } 5203 5204 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5205 void *vg, float_status *status, uint32_t desc) 5206 { 5207 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0); 5208 } 5209 5210 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5211 void *vg, float_status *status, uint32_t desc) 5212 { 5213 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 5214 float_muladd_negate_product); 5215 } 5216 5217 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5218 void *vg, float_status *status, uint32_t desc) 5219 { 5220 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 5221 float_muladd_negate_product | float_muladd_negate_c); 5222 } 5223 5224 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5225 void *vg, float_status *status, uint32_t desc) 5226 { 5227 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 5228 float_muladd_negate_c); 5229 } 5230 5231 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, 5232 float_status *status, uint32_t desc, 5233 uint64_t neg1, uint64_t neg3, int flags) 5234 { 5235 intptr_t i = simd_oprsz(desc); 5236 uint64_t *g = vg; 5237 5238 do { 5239 uint64_t pg = g[(i - 1) >> 6]; 5240 do { 5241 i -= 8; 5242 if (likely((pg >> (i & 63)) & 1)) { 5243 float64 e1, e2, e3, r; 5244 5245 e1 = *(uint64_t *)(vn + i) ^ neg1; 5246 e2 = *(uint64_t *)(vm + i); 5247 e3 = *(uint64_t *)(va + i) ^ neg3; 5248 r = float64_muladd(e1, e2, e3, flags, status); 5249 *(uint64_t *)(vd + i) = r; 5250 } 5251 } while (i & 63); 5252 } while (i != 0); 5253 } 5254 5255 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5256 void *vg, float_status *status, uint32_t desc) 5257 { 5258 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 5259 } 5260 5261 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5262 void *vg, float_status *status, uint32_t desc) 5263 { 5264 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0); 5265 } 5266 5267 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5268 void *vg, float_status *status, uint32_t desc) 5269 { 5270 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0); 5271 } 5272 5273 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5274 void *vg, float_status *status, uint32_t desc) 5275 { 5276 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0); 5277 } 5278 5279 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5280 void *vg, float_status *status, uint32_t desc) 
5281 { 5282 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5283 float_muladd_negate_product); 5284 } 5285 5286 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5287 void *vg, float_status *status, uint32_t desc) 5288 { 5289 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5290 float_muladd_negate_product | float_muladd_negate_c); 5291 } 5292 5293 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5294 void *vg, float_status *status, uint32_t desc) 5295 { 5296 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5297 float_muladd_negate_c); 5298 } 5299 5300 /* Two operand floating-point comparison controlled by a predicate. 5301 * Unlike the integer version, we are not allowed to optimistically 5302 * compare operands, since the comparison may have side effects wrt 5303 * the FPSR. 5304 */ 5305 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ 5306 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 5307 float_status *status, uint32_t desc) \ 5308 { \ 5309 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 5310 uint64_t *d = vd, *g = vg; \ 5311 do { \ 5312 uint64_t out = 0, pg = g[j]; \ 5313 do { \ 5314 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 5315 if (likely((pg >> (i & 63)) & 1)) { \ 5316 TYPE nn = *(TYPE *)(vn + H(i)); \ 5317 TYPE mm = *(TYPE *)(vm + H(i)); \ 5318 out |= OP(TYPE, nn, mm, status); \ 5319 } \ 5320 } while (i & 63); \ 5321 d[j--] = out; \ 5322 } while (i > 0); \ 5323 } 5324 5325 #define DO_FPCMP_PPZZ_H(NAME, OP) \ 5326 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) 5327 #define DO_FPCMP_PPZZ_S(NAME, OP) \ 5328 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) 5329 #define DO_FPCMP_PPZZ_D(NAME, OP) \ 5330 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) 5331 5332 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \ 5333 DO_FPCMP_PPZZ_H(NAME, OP) \ 5334 DO_FPCMP_PPZZ_S(NAME, OP) \ 5335 DO_FPCMP_PPZZ_D(NAME, OP) 5336 5337 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 5338 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 5339 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 5340 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 5341 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 5342 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 5343 #define DO_FCMUO(TYPE, X, Y, ST) \ 5344 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered 5345 #define DO_FACGE(TYPE, X, Y, ST) \ 5346 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 5347 #define DO_FACGT(TYPE, X, Y, ST) \ 5348 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 5349 5350 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) 5351 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) 5352 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) 5353 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) 5354 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) 5355 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) 5356 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) 5357 5358 #undef DO_FPCMP_PPZZ_ALL 5359 #undef DO_FPCMP_PPZZ_D 5360 #undef DO_FPCMP_PPZZ_S 5361 #undef DO_FPCMP_PPZZ_H 5362 #undef DO_FPCMP_PPZZ 5363 5364 /* One operand floating-point comparison against zero, controlled 5365 * by a predicate. 
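 *
 * As with the two-operand expander above, OP yields one result bit per
 * element; the second operand here is the integer constant 0, whose bit
 * pattern is +0.0 in all three formats.  Note that DO_FCMLE and
 * DO_FCMLT are only instantiated in these compare-with-zero forms.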
5366 */ 5367 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ 5368 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 5369 float_status *status, uint32_t desc) \ 5370 { \ 5371 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 5372 uint64_t *d = vd, *g = vg; \ 5373 do { \ 5374 uint64_t out = 0, pg = g[j]; \ 5375 do { \ 5376 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 5377 if ((pg >> (i & 63)) & 1) { \ 5378 TYPE nn = *(TYPE *)(vn + H(i)); \ 5379 out |= OP(TYPE, nn, 0, status); \ 5380 } \ 5381 } while (i & 63); \ 5382 d[j--] = out; \ 5383 } while (i > 0); \ 5384 } 5385 5386 #define DO_FPCMP_PPZ0_H(NAME, OP) \ 5387 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) 5388 #define DO_FPCMP_PPZ0_S(NAME, OP) \ 5389 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) 5390 #define DO_FPCMP_PPZ0_D(NAME, OP) \ 5391 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) 5392 5393 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \ 5394 DO_FPCMP_PPZ0_H(NAME, OP) \ 5395 DO_FPCMP_PPZ0_S(NAME, OP) \ 5396 DO_FPCMP_PPZ0_D(NAME, OP) 5397 5398 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) 5399 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) 5400 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) 5401 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) 5402 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) 5403 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) 5404 5405 /* FP Trig Multiply-Add. */ 5406 5407 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, 5408 float_status *s, uint32_t desc) 5409 { 5410 static const float16 coeff[16] = { 5411 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 5412 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 5413 }; 5414 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); 5415 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5416 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5417 float16 *d = vd, *n = vn, *m = vm; 5418 5419 for (i = 0; i < opr_sz; i++) { 5420 float16 mm = m[i]; 5421 intptr_t xx = x; 5422 int flags = 0; 5423 5424 if (float16_is_neg(mm)) { 5425 if (fpcr_ah) { 5426 flags = float_muladd_negate_product; 5427 } else { 5428 mm = float16_abs(mm); 5429 } 5430 xx += 8; 5431 } 5432 d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s); 5433 } 5434 } 5435 5436 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, 5437 float_status *s, uint32_t desc) 5438 { 5439 static const float32 coeff[16] = { 5440 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, 5441 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, 5442 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, 5443 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, 5444 }; 5445 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); 5446 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5447 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5448 float32 *d = vd, *n = vn, *m = vm; 5449 5450 for (i = 0; i < opr_sz; i++) { 5451 float32 mm = m[i]; 5452 intptr_t xx = x; 5453 int flags = 0; 5454 5455 if (float32_is_neg(mm)) { 5456 if (fpcr_ah) { 5457 flags = float_muladd_negate_product; 5458 } else { 5459 mm = float32_abs(mm); 5460 } 5461 xx += 8; 5462 } 5463 d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s); 5464 } 5465 } 5466 5467 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, 5468 float_status *s, uint32_t desc) 5469 { 5470 static const float64 coeff[16] = { 5471 0x3ff0000000000000ull, 0xbfc5555555555543ull, 5472 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, 5473 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, 5474 0x3de5d8408868552full, 0x0000000000000000ull, 5475 0x3ff0000000000000ull, 0xbfe0000000000000ull, 5476 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, 5477 
0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, 5478 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, 5479 }; 5480 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); 5481 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5482 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5483 float64 *d = vd, *n = vn, *m = vm; 5484 5485 for (i = 0; i < opr_sz; i++) { 5486 float64 mm = m[i]; 5487 intptr_t xx = x; 5488 int flags = 0; 5489 5490 if (float64_is_neg(mm)) { 5491 if (fpcr_ah) { 5492 flags = float_muladd_negate_product; 5493 } else { 5494 mm = float64_abs(mm); 5495 } 5496 xx += 8; 5497 } 5498 d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s); 5499 } 5500 } 5501 5502 /* 5503 * FP Complex Add 5504 */ 5505 5506 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, 5507 float_status *s, uint32_t desc) 5508 { 5509 intptr_t j, i = simd_oprsz(desc); 5510 uint64_t *g = vg; 5511 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5512 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5513 5514 do { 5515 uint64_t pg = g[(i - 1) >> 6]; 5516 do { 5517 float16 e0, e1, e2, e3; 5518 5519 /* I holds the real index; J holds the imag index. */ 5520 j = i - sizeof(float16); 5521 i -= 2 * sizeof(float16); 5522 5523 e0 = *(float16 *)(vn + H1_2(i)); 5524 e1 = *(float16 *)(vm + H1_2(j)); 5525 e2 = *(float16 *)(vn + H1_2(j)); 5526 e3 = *(float16 *)(vm + H1_2(i)); 5527 5528 if (rot) { 5529 e3 = float16_maybe_ah_chs(e3, fpcr_ah); 5530 } else { 5531 e1 = float16_maybe_ah_chs(e1, fpcr_ah); 5532 } 5533 5534 if (likely((pg >> (i & 63)) & 1)) { 5535 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s); 5536 } 5537 if (likely((pg >> (j & 63)) & 1)) { 5538 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s); 5539 } 5540 } while (i & 63); 5541 } while (i != 0); 5542 } 5543 5544 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, 5545 float_status *s, uint32_t desc) 5546 { 5547 intptr_t j, i = simd_oprsz(desc); 5548 uint64_t *g = vg; 5549 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5550 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5551 5552 do { 5553 uint64_t pg = g[(i - 1) >> 6]; 5554 do { 5555 float32 e0, e1, e2, e3; 5556 5557 /* I holds the real index; J holds the imag index. */ 5558 j = i - sizeof(float32); 5559 i -= 2 * sizeof(float32); 5560 5561 e0 = *(float32 *)(vn + H1_2(i)); 5562 e1 = *(float32 *)(vm + H1_2(j)); 5563 e2 = *(float32 *)(vn + H1_2(j)); 5564 e3 = *(float32 *)(vm + H1_2(i)); 5565 5566 if (rot) { 5567 e3 = float32_maybe_ah_chs(e3, fpcr_ah); 5568 } else { 5569 e1 = float32_maybe_ah_chs(e1, fpcr_ah); 5570 } 5571 5572 if (likely((pg >> (i & 63)) & 1)) { 5573 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s); 5574 } 5575 if (likely((pg >> (j & 63)) & 1)) { 5576 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s); 5577 } 5578 } while (i & 63); 5579 } while (i != 0); 5580 } 5581 5582 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, 5583 float_status *s, uint32_t desc) 5584 { 5585 intptr_t j, i = simd_oprsz(desc); 5586 uint64_t *g = vg; 5587 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5588 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5589 5590 do { 5591 uint64_t pg = g[(i - 1) >> 6]; 5592 do { 5593 float64 e0, e1, e2, e3; 5594 5595 /* I holds the real index; J holds the imag index. 
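 *
 * The rot flag selects the rotation: rot == 0 implements FCADD #90
 * (d = n + i*m, negating e1, the imaginary part of Zm), while rot == 1
 * implements FCADD #270 (d = n - i*m, negating e3, the real part of Zm).
 * With FPCR.AH set, the negation helper is expected to leave NaN
 * operands unchanged rather than flipping their sign.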
*/ 5596 j = i - sizeof(float64); 5597 i -= 2 * sizeof(float64); 5598 5599 e0 = *(float64 *)(vn + H1_2(i)); 5600 e1 = *(float64 *)(vm + H1_2(j)); 5601 e2 = *(float64 *)(vn + H1_2(j)); 5602 e3 = *(float64 *)(vm + H1_2(i)); 5603 5604 if (rot) { 5605 e3 = float64_maybe_ah_chs(e3, fpcr_ah); 5606 } else { 5607 e1 = float64_maybe_ah_chs(e1, fpcr_ah); 5608 } 5609 5610 if (likely((pg >> (i & 63)) & 1)) { 5611 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s); 5612 } 5613 if (likely((pg >> (j & 63)) & 1)) { 5614 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s); 5615 } 5616 } while (i & 63); 5617 } while (i != 0); 5618 } 5619 5620 /* 5621 * FP Complex Multiply 5622 */ 5623 5624 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5625 void *vg, float_status *status, uint32_t desc) 5626 { 5627 intptr_t j, i = simd_oprsz(desc); 5628 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5629 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5630 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5631 uint32_t negf_real = flip ^ negf_imag; 5632 float16 negx_imag, negx_real; 5633 uint64_t *g = vg; 5634 5635 /* With AH=0, use negx; with AH=1 use negf. */ 5636 negx_real = (negf_real & ~fpcr_ah) << 15; 5637 negx_imag = (negf_imag & ~fpcr_ah) << 15; 5638 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5639 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5640 5641 do { 5642 uint64_t pg = g[(i - 1) >> 6]; 5643 do { 5644 float16 e1, e2, e3, e4, nr, ni, mr, mi, d; 5645 5646 /* I holds the real index; J holds the imag index. */ 5647 j = i - sizeof(float16); 5648 i -= 2 * sizeof(float16); 5649 5650 nr = *(float16 *)(vn + H1_2(i)); 5651 ni = *(float16 *)(vn + H1_2(j)); 5652 mr = *(float16 *)(vm + H1_2(i)); 5653 mi = *(float16 *)(vm + H1_2(j)); 5654 5655 e2 = (flip ? ni : nr); 5656 e1 = (flip ? mi : mr) ^ negx_real; 5657 e4 = e2; 5658 e3 = (flip ? mr : mi) ^ negx_imag; 5659 5660 if (likely((pg >> (i & 63)) & 1)) { 5661 d = *(float16 *)(va + H1_2(i)); 5662 d = float16_muladd(e2, e1, d, negf_real, status); 5663 *(float16 *)(vd + H1_2(i)) = d; 5664 } 5665 if (likely((pg >> (j & 63)) & 1)) { 5666 d = *(float16 *)(va + H1_2(j)); 5667 d = float16_muladd(e4, e3, d, negf_imag, status); 5668 *(float16 *)(vd + H1_2(j)) = d; 5669 } 5670 } while (i & 63); 5671 } while (i != 0); 5672 } 5673 5674 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5675 void *vg, float_status *status, uint32_t desc) 5676 { 5677 intptr_t j, i = simd_oprsz(desc); 5678 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5679 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5680 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5681 uint32_t negf_real = flip ^ negf_imag; 5682 float32 negx_imag, negx_real; 5683 uint64_t *g = vg; 5684 5685 /* With AH=0, use negx; with AH=1 use negf. */ 5686 negx_real = (negf_real & ~fpcr_ah) << 31; 5687 negx_imag = (negf_imag & ~fpcr_ah) << 31; 5688 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5689 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5690 5691 do { 5692 uint64_t pg = g[(i - 1) >> 6]; 5693 do { 5694 float32 e1, e2, e3, e4, nr, ni, mr, mi, d; 5695 5696 /* I holds the real index; J holds the imag index. */ 5697 j = i - sizeof(float32); 5698 i -= 2 * sizeof(float32); 5699 5700 nr = *(float32 *)(vn + H1_2(i)); 5701 ni = *(float32 *)(vn + H1_2(j)); 5702 mr = *(float32 *)(vm + H1_2(i)); 5703 mi = *(float32 *)(vm + H1_2(j)); 5704 5705 e2 = (flip ? 
ni : nr); 5706 e1 = (flip ? mi : mr) ^ negx_real; 5707 e4 = e2; 5708 e3 = (flip ? mr : mi) ^ negx_imag; 5709 5710 if (likely((pg >> (i & 63)) & 1)) { 5711 d = *(float32 *)(va + H1_2(i)); 5712 d = float32_muladd(e2, e1, d, negf_real, status); 5713 *(float32 *)(vd + H1_2(i)) = d; 5714 } 5715 if (likely((pg >> (j & 63)) & 1)) { 5716 d = *(float32 *)(va + H1_2(j)); 5717 d = float32_muladd(e4, e3, d, negf_imag, status); 5718 *(float32 *)(vd + H1_2(j)) = d; 5719 } 5720 } while (i & 63); 5721 } while (i != 0); 5722 } 5723 5724 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5725 void *vg, float_status *status, uint32_t desc) 5726 { 5727 intptr_t j, i = simd_oprsz(desc); 5728 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5729 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5730 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5731 uint32_t negf_real = flip ^ negf_imag; 5732 float64 negx_imag, negx_real; 5733 uint64_t *g = vg; 5734 5735 /* With AH=0, use negx; with AH=1 use negf. */ 5736 negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63; 5737 negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63; 5738 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5739 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5740 5741 do { 5742 uint64_t pg = g[(i - 1) >> 6]; 5743 do { 5744 float64 e1, e2, e3, e4, nr, ni, mr, mi, d; 5745 5746 /* I holds the real index; J holds the imag index. */ 5747 j = i - sizeof(float64); 5748 i -= 2 * sizeof(float64); 5749 5750 nr = *(float64 *)(vn + H1_2(i)); 5751 ni = *(float64 *)(vn + H1_2(j)); 5752 mr = *(float64 *)(vm + H1_2(i)); 5753 mi = *(float64 *)(vm + H1_2(j)); 5754 5755 e2 = (flip ? ni : nr); 5756 e1 = (flip ? mi : mr) ^ negx_real; 5757 e4 = e2; 5758 e3 = (flip ? mr : mi) ^ negx_imag; 5759 5760 if (likely((pg >> (i & 63)) & 1)) { 5761 d = *(float64 *)(va + H1_2(i)); 5762 d = float64_muladd(e2, e1, d, negf_real, status); 5763 *(float64 *)(vd + H1_2(i)) = d; 5764 } 5765 if (likely((pg >> (j & 63)) & 1)) { 5766 d = *(float64 *)(va + H1_2(j)); 5767 d = float64_muladd(e4, e3, d, negf_imag, status); 5768 *(float64 *)(vd + H1_2(j)) = d; 5769 } 5770 } while (i & 63); 5771 } while (i != 0); 5772 } 5773 5774 /* 5775 * Load contiguous data, protected by a governing predicate. 5776 */ 5777 5778 /* 5779 * Skip through a sequence of inactive elements in the guarding predicate @vg, 5780 * beginning at @reg_off bounded by @reg_max. Return the offset of the active 5781 * element >= @reg_off, or @reg_max if there were no active elements at all. 5782 */ 5783 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, 5784 intptr_t reg_max, int esz) 5785 { 5786 uint64_t pg_mask = pred_esz_masks[esz]; 5787 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); 5788 5789 /* In normal usage, the first element is active. */ 5790 if (likely(pg & 1)) { 5791 return reg_off; 5792 } 5793 5794 if (pg == 0) { 5795 reg_off &= -64; 5796 do { 5797 reg_off += 64; 5798 if (unlikely(reg_off >= reg_max)) { 5799 /* The entire predicate was false. */ 5800 return reg_max; 5801 } 5802 pg = vg[reg_off >> 6] & pg_mask; 5803 } while (pg == 0); 5804 } 5805 reg_off += ctz64(pg); 5806 5807 /* We should never see an out of range predicate bit set. */ 5808 tcg_debug_assert(reg_off < reg_max); 5809 return reg_off; 5810 } 5811 5812 /* 5813 * Resolve the guest virtual address to info->host and info->flags. 5814 * If @nofault, return false if the page is invalid, otherwise 5815 * exit via page fault exception. 
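 *
 * The probe itself is made at @addr + @mem_off, but on success
 * info->host is adjusted to be relative to @addr, so that
 * info->host + mem_off addresses the probed element.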
5816 */ 5817 5818 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, 5819 target_ulong addr, int mem_off, MMUAccessType access_type, 5820 int mmu_idx, uintptr_t retaddr) 5821 { 5822 int flags; 5823 5824 addr += mem_off; 5825 5826 /* 5827 * User-only currently always issues with TBI. See the comment 5828 * above useronly_clean_ptr. Usually we clean this top byte away 5829 * during translation, but we can't do that for e.g. vector + imm 5830 * addressing modes. 5831 * 5832 * We currently always enable TBI for user-only, and do not provide 5833 * a way to turn it off. So clean the pointer unconditionally here, 5834 * rather than look it up here, or pass it down from above. 5835 */ 5836 addr = useronly_clean_ptr(addr); 5837 5838 #ifdef CONFIG_USER_ONLY 5839 flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault, 5840 &info->host, retaddr); 5841 #else 5842 CPUTLBEntryFull *full; 5843 flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault, 5844 &info->host, &full, retaddr); 5845 #endif 5846 info->flags = flags; 5847 5848 if (flags & TLB_INVALID_MASK) { 5849 g_assert(nofault); 5850 return false; 5851 } 5852 5853 #ifdef CONFIG_USER_ONLY 5854 memset(&info->attrs, 0, sizeof(info->attrs)); 5855 /* Require both ANON and MTE; see allocation_tag_mem(). */ 5856 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); 5857 #else 5858 info->attrs = full->attrs; 5859 info->tagged = full->extra.arm.pte_attrs == 0xf0; 5860 #endif 5861 5862 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ 5863 info->host -= mem_off; 5864 return true; 5865 } 5866 5867 /* 5868 * Find first active element on each page, and a loose bound for the 5869 * final element on each page. Identify any single element that spans 5870 * the page boundary. Return true if there are any active elements. 5871 */ 5872 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, 5873 intptr_t reg_max, int esz, int msize) 5874 { 5875 const int esize = 1 << esz; 5876 const uint64_t pg_mask = pred_esz_masks[esz]; 5877 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 5878 intptr_t mem_off_last, mem_off_split; 5879 intptr_t page_split, elt_split; 5880 intptr_t i; 5881 5882 /* Set all of the element indices to -1, and the TLB data to 0. */ 5883 memset(info, -1, offsetof(SVEContLdSt, page)); 5884 memset(info->page, 0, sizeof(info->page)); 5885 5886 /* Gross scan over the entire predicate to find bounds. */ 5887 i = 0; 5888 do { 5889 uint64_t pg = vg[i] & pg_mask; 5890 if (pg) { 5891 reg_off_last = i * 64 + 63 - clz64(pg); 5892 if (reg_off_first < 0) { 5893 reg_off_first = i * 64 + ctz64(pg); 5894 } 5895 } 5896 } while (++i * 64 < reg_max); 5897 5898 if (unlikely(reg_off_first < 0)) { 5899 /* No active elements, no pages touched. */ 5900 return false; 5901 } 5902 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); 5903 5904 info->reg_off_first[0] = reg_off_first; 5905 info->mem_off_first[0] = (reg_off_first >> esz) * msize; 5906 mem_off_last = (reg_off_last >> esz) * msize; 5907 5908 page_split = -(addr | TARGET_PAGE_MASK); 5909 if (likely(mem_off_last + msize <= page_split)) { 5910 /* The entire operation fits within a single page. 
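 *
 * (page_split == -(addr | TARGET_PAGE_MASK) is the number of bytes
 * remaining on the first page; e.g., assuming 4KiB pages, an address
 * whose low bits are 0xf80 leaves 0x80 bytes before the boundary.)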
*/ 5911 info->reg_off_last[0] = reg_off_last; 5912 return true; 5913 } 5914 5915 info->page_split = page_split; 5916 elt_split = page_split / msize; 5917 reg_off_split = elt_split << esz; 5918 mem_off_split = elt_split * msize; 5919 5920 /* 5921 * This is the last full element on the first page, but it is not 5922 * necessarily active. If there is no full element, i.e. the first 5923 * active element is the one that's split, this value remains -1. 5924 * It is useful as iteration bounds. 5925 */ 5926 if (elt_split != 0) { 5927 info->reg_off_last[0] = reg_off_split - esize; 5928 } 5929 5930 /* Determine if an unaligned element spans the pages. */ 5931 if (page_split % msize != 0) { 5932 /* It is helpful to know if the split element is active. */ 5933 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { 5934 info->reg_off_split = reg_off_split; 5935 info->mem_off_split = mem_off_split; 5936 5937 if (reg_off_split == reg_off_last) { 5938 /* The page crossing element is last. */ 5939 return true; 5940 } 5941 } 5942 reg_off_split += esize; 5943 mem_off_split += msize; 5944 } 5945 5946 /* 5947 * We do want the first active element on the second page, because 5948 * this may affect the address reported in an exception. 5949 */ 5950 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); 5951 tcg_debug_assert(reg_off_split <= reg_off_last); 5952 info->reg_off_first[1] = reg_off_split; 5953 info->mem_off_first[1] = (reg_off_split >> esz) * msize; 5954 info->reg_off_last[1] = reg_off_last; 5955 return true; 5956 } 5957 5958 /* 5959 * Resolve the guest virtual addresses to info->page[]. 5960 * Control the generation of page faults with @fault. Return false if 5961 * there is no work to do, which can only happen with @fault == FAULT_NO. 5962 */ 5963 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, 5964 CPUARMState *env, target_ulong addr, 5965 MMUAccessType access_type, uintptr_t retaddr) 5966 { 5967 int mmu_idx = arm_env_mmu_index(env); 5968 int mem_off = info->mem_off_first[0]; 5969 bool nofault = fault == FAULT_NO; 5970 bool have_work = true; 5971 5972 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, 5973 access_type, mmu_idx, retaddr)) { 5974 /* No work to be done. */ 5975 return false; 5976 } 5977 5978 if (likely(info->page_split < 0)) { 5979 /* The entire operation was on the one page. */ 5980 return true; 5981 } 5982 5983 /* 5984 * If the second page is invalid, then we want the fault address to be 5985 * the first byte on that page which is accessed. 5986 */ 5987 if (info->mem_off_split >= 0) { 5988 /* 5989 * There is an element split across the pages. The fault address 5990 * should be the first byte of the second page. 5991 */ 5992 mem_off = info->page_split; 5993 /* 5994 * If the split element is also the first active element 5995 * of the vector, then: For first-fault we should continue 5996 * to generate faults for the second page. For no-fault, 5997 * we have work only if the second page is valid. 5998 */ 5999 if (info->mem_off_first[0] < info->mem_off_split) { 6000 nofault = FAULT_FIRST; 6001 have_work = false; 6002 } 6003 } else { 6004 /* 6005 * There is no element split across the pages. The fault address 6006 * should be the first active element on the second page. 6007 */ 6008 mem_off = info->mem_off_first[1]; 6009 /* 6010 * There must have been one active element on the first page, 6011 * so we're out of first-fault territory. 
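 * Faults on this page are therefore raised only for FAULT_ALL; for
 * FAULT_FIRST and FAULT_NO we merely probe and report the result
 * through have_work.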
6012 */ 6013 nofault = fault != FAULT_ALL; 6014 } 6015 6016 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off, 6017 access_type, mmu_idx, retaddr); 6018 return have_work; 6019 } 6020 6021 #ifndef CONFIG_USER_ONLY 6022 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, 6023 uint64_t *vg, target_ulong addr, 6024 int esize, int msize, int wp_access, 6025 uintptr_t retaddr) 6026 { 6027 intptr_t mem_off, reg_off, reg_last; 6028 int flags0 = info->page[0].flags; 6029 int flags1 = info->page[1].flags; 6030 6031 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { 6032 return; 6033 } 6034 6035 /* Indicate that watchpoints are handled. */ 6036 info->page[0].flags = flags0 & ~TLB_WATCHPOINT; 6037 info->page[1].flags = flags1 & ~TLB_WATCHPOINT; 6038 6039 if (flags0 & TLB_WATCHPOINT) { 6040 mem_off = info->mem_off_first[0]; 6041 reg_off = info->reg_off_first[0]; 6042 reg_last = info->reg_off_last[0]; 6043 6044 while (reg_off <= reg_last) { 6045 uint64_t pg = vg[reg_off >> 6]; 6046 do { 6047 if ((pg >> (reg_off & 63)) & 1) { 6048 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 6049 msize, info->page[0].attrs, 6050 wp_access, retaddr); 6051 } 6052 reg_off += esize; 6053 mem_off += msize; 6054 } while (reg_off <= reg_last && (reg_off & 63)); 6055 } 6056 } 6057 6058 mem_off = info->mem_off_split; 6059 if (mem_off >= 0) { 6060 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize, 6061 info->page[0].attrs, wp_access, retaddr); 6062 } 6063 6064 mem_off = info->mem_off_first[1]; 6065 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) { 6066 reg_off = info->reg_off_first[1]; 6067 reg_last = info->reg_off_last[1]; 6068 6069 do { 6070 uint64_t pg = vg[reg_off >> 6]; 6071 do { 6072 if ((pg >> (reg_off & 63)) & 1) { 6073 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 6074 msize, info->page[1].attrs, 6075 wp_access, retaddr); 6076 } 6077 reg_off += esize; 6078 mem_off += msize; 6079 } while (reg_off & 63); 6080 } while (reg_off <= reg_last); 6081 } 6082 } 6083 #endif 6084 6085 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, 6086 uint64_t *vg, target_ulong addr, int esize, 6087 int msize, uint32_t mtedesc, uintptr_t ra) 6088 { 6089 intptr_t mem_off, reg_off, reg_last; 6090 6091 /* Process the page only if MemAttr == Tagged. */ 6092 if (info->page[0].tagged) { 6093 mem_off = info->mem_off_first[0]; 6094 reg_off = info->reg_off_first[0]; 6095 reg_last = info->reg_off_split; 6096 if (reg_last < 0) { 6097 reg_last = info->reg_off_last[0]; 6098 } 6099 6100 do { 6101 uint64_t pg = vg[reg_off >> 6]; 6102 do { 6103 if ((pg >> (reg_off & 63)) & 1) { 6104 mte_check(env, mtedesc, addr, ra); 6105 } 6106 reg_off += esize; 6107 mem_off += msize; 6108 } while (reg_off <= reg_last && (reg_off & 63)); 6109 } while (reg_off <= reg_last); 6110 } 6111 6112 mem_off = info->mem_off_first[1]; 6113 if (mem_off >= 0 && info->page[1].tagged) { 6114 reg_off = info->reg_off_first[1]; 6115 reg_last = info->reg_off_last[1]; 6116 6117 do { 6118 uint64_t pg = vg[reg_off >> 6]; 6119 do { 6120 if ((pg >> (reg_off & 63)) & 1) { 6121 mte_check(env, mtedesc, addr, ra); 6122 } 6123 reg_off += esize; 6124 mem_off += msize; 6125 } while (reg_off & 63); 6126 } while (reg_off <= reg_last); 6127 } 6128 } 6129 6130 /* 6131 * Common helper for all contiguous 1,2,3,4-register predicated stores. 
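 *
 * @N is the number of interleaved registers (1-4): each register
 * element is 1 << esz bytes, while each structure element consumes
 * N << msz bytes of memory.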
6132 */ 6133 static inline QEMU_ALWAYS_INLINE 6134 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, 6135 uint32_t desc, const uintptr_t retaddr, 6136 const int esz, const int msz, const int N, uint32_t mtedesc, 6137 sve_ldst1_host_fn *host_fn, 6138 sve_ldst1_tlb_fn *tlb_fn) 6139 { 6140 const unsigned rd = simd_data(desc); 6141 const intptr_t reg_max = simd_oprsz(desc); 6142 intptr_t reg_off, reg_last, mem_off; 6143 SVEContLdSt info; 6144 void *host; 6145 int flags, i; 6146 6147 /* Find the active elements. */ 6148 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6149 /* The entire predicate was false; no load occurs. */ 6150 for (i = 0; i < N; ++i) { 6151 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 6152 } 6153 return; 6154 } 6155 6156 /* Probe the page(s). Exit with exception for any invalid page. */ 6157 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); 6158 6159 /* Handle watchpoints for all active elements. */ 6160 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6161 BP_MEM_READ, retaddr); 6162 6163 /* 6164 * Handle mte checks for all active elements. 6165 * Since TBI must be set for MTE, !mtedesc => !mte_active. 6166 */ 6167 if (mtedesc) { 6168 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6169 mtedesc, retaddr); 6170 } 6171 6172 flags = info.page[0].flags | info.page[1].flags; 6173 if (unlikely(flags != 0)) { 6174 /* 6175 * At least one page includes MMIO. 6176 * Any bus operation can fail with cpu_transaction_failed, 6177 * which for ARM will raise SyncExternal. Perform the load 6178 * into scratch memory to preserve register state until the end. 6179 */ 6180 ARMVectorReg scratch[4] = { }; 6181 6182 mem_off = info.mem_off_first[0]; 6183 reg_off = info.reg_off_first[0]; 6184 reg_last = info.reg_off_last[1]; 6185 if (reg_last < 0) { 6186 reg_last = info.reg_off_split; 6187 if (reg_last < 0) { 6188 reg_last = info.reg_off_last[0]; 6189 } 6190 } 6191 6192 do { 6193 uint64_t pg = vg[reg_off >> 6]; 6194 do { 6195 if ((pg >> (reg_off & 63)) & 1) { 6196 for (i = 0; i < N; ++i) { 6197 tlb_fn(env, &scratch[i], reg_off, 6198 addr + mem_off + (i << msz), retaddr); 6199 } 6200 } 6201 reg_off += 1 << esz; 6202 mem_off += N << msz; 6203 } while (reg_off & 63); 6204 } while (reg_off <= reg_last); 6205 6206 for (i = 0; i < N; ++i) { 6207 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); 6208 } 6209 return; 6210 } 6211 6212 /* The entire operation is in RAM, on valid pages. */ 6213 6214 for (i = 0; i < N; ++i) { 6215 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 6216 } 6217 6218 mem_off = info.mem_off_first[0]; 6219 reg_off = info.reg_off_first[0]; 6220 reg_last = info.reg_off_last[0]; 6221 host = info.page[0].host; 6222 6223 set_helper_retaddr(retaddr); 6224 6225 while (reg_off <= reg_last) { 6226 uint64_t pg = vg[reg_off >> 6]; 6227 do { 6228 if ((pg >> (reg_off & 63)) & 1) { 6229 for (i = 0; i < N; ++i) { 6230 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6231 host + mem_off + (i << msz)); 6232 } 6233 } 6234 reg_off += 1 << esz; 6235 mem_off += N << msz; 6236 } while (reg_off <= reg_last && (reg_off & 63)); 6237 } 6238 6239 clear_helper_retaddr(); 6240 6241 /* 6242 * Use the slow path to manage the cross-page misalignment. 6243 * But we know this is RAM and cannot trap. 
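 * (The element straddling the boundary is handled with tlb_fn because
 * the two guest pages need not be contiguous in host memory; both pages
 * were already validated above, so the access cannot fault.)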
6244 */ 6245 mem_off = info.mem_off_split; 6246 if (unlikely(mem_off >= 0)) { 6247 reg_off = info.reg_off_split; 6248 for (i = 0; i < N; ++i) { 6249 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6250 addr + mem_off + (i << msz), retaddr); 6251 } 6252 } 6253 6254 mem_off = info.mem_off_first[1]; 6255 if (unlikely(mem_off >= 0)) { 6256 reg_off = info.reg_off_first[1]; 6257 reg_last = info.reg_off_last[1]; 6258 host = info.page[1].host; 6259 6260 set_helper_retaddr(retaddr); 6261 6262 do { 6263 uint64_t pg = vg[reg_off >> 6]; 6264 do { 6265 if ((pg >> (reg_off & 63)) & 1) { 6266 for (i = 0; i < N; ++i) { 6267 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6268 host + mem_off + (i << msz)); 6269 } 6270 } 6271 reg_off += 1 << esz; 6272 mem_off += N << msz; 6273 } while (reg_off & 63); 6274 } while (reg_off <= reg_last); 6275 6276 clear_helper_retaddr(); 6277 } 6278 } 6279 6280 static inline QEMU_ALWAYS_INLINE 6281 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6282 uint32_t desc, const uintptr_t ra, 6283 const int esz, const int msz, const int N, 6284 sve_ldst1_host_fn *host_fn, 6285 sve_ldst1_tlb_fn *tlb_fn) 6286 { 6287 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6288 int bit55 = extract64(addr, 55, 1); 6289 6290 /* Remove mtedesc from the normal sve descriptor. */ 6291 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6292 6293 /* Perform gross MTE suppression early. */ 6294 if (!tbi_check(mtedesc, bit55) || 6295 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6296 mtedesc = 0; 6297 } 6298 6299 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6300 } 6301 6302 #define DO_LD1_1(NAME, ESZ) \ 6303 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ 6304 target_ulong addr, uint32_t desc) \ 6305 { \ 6306 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ 6307 sve_##NAME##_host, sve_##NAME##_tlb); \ 6308 } \ 6309 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6310 target_ulong addr, uint32_t desc) \ 6311 { \ 6312 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ 6313 sve_##NAME##_host, sve_##NAME##_tlb); \ 6314 } 6315 6316 #define DO_LD1_2(NAME, ESZ, MSZ) \ 6317 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ 6318 target_ulong addr, uint32_t desc) \ 6319 { \ 6320 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 6321 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 6322 } \ 6323 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ 6324 target_ulong addr, uint32_t desc) \ 6325 { \ 6326 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 6327 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 6328 } \ 6329 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6330 target_ulong addr, uint32_t desc) \ 6331 { \ 6332 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 6333 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 6334 } \ 6335 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6336 target_ulong addr, uint32_t desc) \ 6337 { \ 6338 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 6339 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 6340 } 6341 6342 DO_LD1_1(ld1bb, MO_8) 6343 DO_LD1_1(ld1bhu, MO_16) 6344 DO_LD1_1(ld1bhs, MO_16) 6345 DO_LD1_1(ld1bsu, MO_32) 6346 DO_LD1_1(ld1bss, MO_32) 6347 DO_LD1_1(ld1bdu, MO_64) 6348 DO_LD1_1(ld1bds, MO_64) 6349 6350 DO_LD1_2(ld1hh, MO_16, MO_16) 6351 DO_LD1_2(ld1hsu, MO_32, MO_16) 6352 DO_LD1_2(ld1hss, MO_32, MO_16) 6353 DO_LD1_2(ld1hdu, MO_64, MO_16) 6354 
DO_LD1_2(ld1hds, MO_64, MO_16) 6355 6356 DO_LD1_2(ld1ss, MO_32, MO_32) 6357 DO_LD1_2(ld1sdu, MO_64, MO_32) 6358 DO_LD1_2(ld1sds, MO_64, MO_32) 6359 6360 DO_LD1_2(ld1dd, MO_64, MO_64) 6361 6362 DO_LD1_2(ld1squ, MO_32, MO_128) 6363 DO_LD1_2(ld1dqu, MO_64, MO_128) 6364 6365 #undef DO_LD1_1 6366 #undef DO_LD1_2 6367 6368 #define DO_LDN_1(N) \ 6369 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ 6370 target_ulong addr, uint32_t desc) \ 6371 { \ 6372 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ 6373 sve_ld1bb_host, sve_ld1bb_tlb); \ 6374 } \ 6375 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ 6376 target_ulong addr, uint32_t desc) \ 6377 { \ 6378 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ 6379 sve_ld1bb_host, sve_ld1bb_tlb); \ 6380 } 6381 6382 #define DO_LDN_2(N, SUFF, ESZ) \ 6383 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ 6384 target_ulong addr, uint32_t desc) \ 6385 { \ 6386 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 6387 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 6388 } \ 6389 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ 6390 target_ulong addr, uint32_t desc) \ 6391 { \ 6392 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 6393 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 6394 } \ 6395 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ 6396 target_ulong addr, uint32_t desc) \ 6397 { \ 6398 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 6399 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 6400 } \ 6401 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ 6402 target_ulong addr, uint32_t desc) \ 6403 { \ 6404 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 6405 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 6406 } 6407 6408 DO_LDN_1(2) 6409 DO_LDN_1(3) 6410 DO_LDN_1(4) 6411 6412 DO_LDN_2(2, hh, MO_16) 6413 DO_LDN_2(3, hh, MO_16) 6414 DO_LDN_2(4, hh, MO_16) 6415 6416 DO_LDN_2(2, ss, MO_32) 6417 DO_LDN_2(3, ss, MO_32) 6418 DO_LDN_2(4, ss, MO_32) 6419 6420 DO_LDN_2(2, dd, MO_64) 6421 DO_LDN_2(3, dd, MO_64) 6422 DO_LDN_2(4, dd, MO_64) 6423 6424 DO_LDN_2(2, qq, MO_128) 6425 DO_LDN_2(3, qq, MO_128) 6426 DO_LDN_2(4, qq, MO_128) 6427 6428 #undef DO_LDN_1 6429 #undef DO_LDN_2 6430 6431 /* 6432 * Load contiguous data, first-fault and no-fault. 6433 * 6434 * For user-only, we control the race between page_check_range and 6435 * another thread's munmap by using set/clear_helper_retaddr. Any 6436 * SEGV that occurs between those markers is assumed to be because 6437 * the guest page vanished. Keep that block as small as possible 6438 * so that unrelated QEMU bugs are not blamed on the guest. 6439 */ 6440 6441 /* Fault on byte I. All bits in FFR from I are cleared. The vector 6442 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE 6443 * option, which leaves subsequent data unchanged. 6444 */ 6445 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) 6446 { 6447 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; 6448 6449 if (i & 63) { 6450 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); 6451 i = ROUND_UP(i, 64); 6452 } 6453 for (; i < oprsz; i += 64) { 6454 ffr[i / 64] = 0; 6455 } 6456 } 6457 6458 /* 6459 * Common helper for all contiguous no-fault and first-fault loads. 
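 *
 * @fault distinguishes FAULT_FIRST, where the first active element may
 * trap but later elements may not, from FAULT_NO, where no element may
 * trap at all.  Whenever an access is suppressed, record_fault() clears
 * FFR from that element onward and the load stops there.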
6460 */ 6461 static inline QEMU_ALWAYS_INLINE 6462 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, 6463 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc, 6464 const int esz, const int msz, const SVEContFault fault, 6465 sve_ldst1_host_fn *host_fn, 6466 sve_ldst1_tlb_fn *tlb_fn) 6467 { 6468 const unsigned rd = simd_data(desc); 6469 void *vd = &env->vfp.zregs[rd]; 6470 const intptr_t reg_max = simd_oprsz(desc); 6471 intptr_t reg_off, mem_off, reg_last; 6472 SVEContLdSt info; 6473 int flags; 6474 void *host; 6475 6476 /* Find the active elements. */ 6477 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) { 6478 /* The entire predicate was false; no load occurs. */ 6479 memset(vd, 0, reg_max); 6480 return; 6481 } 6482 reg_off = info.reg_off_first[0]; 6483 6484 /* Probe the page(s). */ 6485 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) { 6486 /* Fault on first element. */ 6487 tcg_debug_assert(fault == FAULT_NO); 6488 memset(vd, 0, reg_max); 6489 goto do_fault; 6490 } 6491 6492 mem_off = info.mem_off_first[0]; 6493 flags = info.page[0].flags; 6494 6495 /* 6496 * Disable MTE checking if the Tagged bit is not set. Since TBI must 6497 * be set within MTEDESC for MTE, !mtedesc => !mte_active. 6498 */ 6499 if (!info.page[0].tagged) { 6500 mtedesc = 0; 6501 } 6502 6503 if (fault == FAULT_FIRST) { 6504 /* Trapping mte check for the first-fault element. */ 6505 if (mtedesc) { 6506 mte_check(env, mtedesc, addr + mem_off, retaddr); 6507 } 6508 6509 /* 6510 * Special handling of the first active element, 6511 * if it crosses a page boundary or is MMIO. 6512 */ 6513 bool is_split = mem_off == info.mem_off_split; 6514 if (unlikely(flags != 0) || unlikely(is_split)) { 6515 /* 6516 * Use the slow path for cross-page handling. 6517 * Might trap for MMIO or watchpoints. 6518 */ 6519 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6520 6521 /* After any fault, zero the other elements. */ 6522 swap_memzero(vd, reg_off); 6523 reg_off += 1 << esz; 6524 mem_off += 1 << msz; 6525 swap_memzero(vd + reg_off, reg_max - reg_off); 6526 6527 if (is_split) { 6528 goto second_page; 6529 } 6530 } else { 6531 memset(vd, 0, reg_max); 6532 } 6533 } else { 6534 memset(vd, 0, reg_max); 6535 if (unlikely(mem_off == info.mem_off_split)) { 6536 /* The first active element crosses a page boundary. */ 6537 flags |= info.page[1].flags; 6538 if (unlikely(flags & TLB_MMIO)) { 6539 /* Some page is MMIO, see below. */ 6540 goto do_fault; 6541 } 6542 if (unlikely(flags & TLB_WATCHPOINT) && 6543 (cpu_watchpoint_address_matches 6544 (env_cpu(env), addr + mem_off, 1 << msz) 6545 & BP_MEM_READ)) { 6546 /* Watchpoint hit, see below. */ 6547 goto do_fault; 6548 } 6549 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6550 goto do_fault; 6551 } 6552 /* 6553 * Use the slow path for cross-page handling. 6554 * This is RAM, without a watchpoint, and will not trap. 6555 */ 6556 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6557 goto second_page; 6558 } 6559 } 6560 6561 /* 6562 * From this point on, all memory operations are MemSingleNF. 6563 * 6564 * Per the MemSingleNF pseudocode, a no-fault load from Device memory 6565 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead. 6566 * 6567 * Unfortuately we do not have access to the memory attributes from the 6568 * PTE to tell Device memory from Normal memory. So we make a mostly 6569 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO. 
6570 * This gives the right answer for the common cases of "Normal memory, 6571 * backed by host RAM" and "Device memory, backed by MMIO". 6572 * The architecture allows us to suppress an NF load and return 6573 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner 6574 * case of "Normal memory, backed by MMIO" is permitted. The case we 6575 * get wrong is "Device memory, backed by host RAM", for which we 6576 * should return (UNKNOWN, FAULT) for but do not. 6577 * 6578 * Similarly, CPU_BP breakpoints would raise exceptions, and so 6579 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and 6580 * architectural breakpoints the same. 6581 */ 6582 if (unlikely(flags & TLB_MMIO)) { 6583 goto do_fault; 6584 } 6585 6586 reg_last = info.reg_off_last[0]; 6587 host = info.page[0].host; 6588 6589 set_helper_retaddr(retaddr); 6590 6591 do { 6592 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); 6593 do { 6594 if ((pg >> (reg_off & 63)) & 1) { 6595 if (unlikely(flags & TLB_WATCHPOINT) && 6596 (cpu_watchpoint_address_matches 6597 (env_cpu(env), addr + mem_off, 1 << msz) 6598 & BP_MEM_READ)) { 6599 clear_helper_retaddr(); 6600 goto do_fault; 6601 } 6602 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6603 clear_helper_retaddr(); 6604 goto do_fault; 6605 } 6606 host_fn(vd, reg_off, host + mem_off); 6607 } 6608 reg_off += 1 << esz; 6609 mem_off += 1 << msz; 6610 } while (reg_off <= reg_last && (reg_off & 63)); 6611 } while (reg_off <= reg_last); 6612 6613 clear_helper_retaddr(); 6614 6615 /* 6616 * MemSingleNF is allowed to fail for any reason. We have special 6617 * code above to handle the first element crossing a page boundary. 6618 * As an implementation choice, decline to handle a cross-page element 6619 * in any other position. 6620 */ 6621 reg_off = info.reg_off_split; 6622 if (reg_off >= 0) { 6623 goto do_fault; 6624 } 6625 6626 second_page: 6627 reg_off = info.reg_off_first[1]; 6628 if (likely(reg_off < 0)) { 6629 /* No active elements on the second page. All done. */ 6630 return; 6631 } 6632 6633 /* 6634 * MemSingleNF is allowed to fail for any reason. As an implementation 6635 * choice, decline to handle elements on the second page. This should 6636 * be low frequency as the guest walks through memory -- the next 6637 * iteration of the guest's loop should be aligned on the page boundary, 6638 * and then all following iterations will stay aligned. 6639 */ 6640 6641 do_fault: 6642 record_fault(env, reg_off, reg_max); 6643 } 6644 6645 static inline QEMU_ALWAYS_INLINE 6646 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, 6647 uint32_t desc, const uintptr_t retaddr, 6648 const int esz, const int msz, const SVEContFault fault, 6649 sve_ldst1_host_fn *host_fn, 6650 sve_ldst1_tlb_fn *tlb_fn) 6651 { 6652 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6653 int bit55 = extract64(addr, 55, 1); 6654 6655 /* Remove mtedesc from the normal sve descriptor. */ 6656 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6657 6658 /* Perform gross MTE suppression early. 
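 * If TBI is disabled for this address, or the tag is one that TCMA
 * leaves unchecked, no MTE fault is possible, so mtedesc is cleared
 * and the per-element checks are skipped entirely.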
*/ 6659 if (!tbi_check(mtedesc, bit55) || 6660 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6661 mtedesc = 0; 6662 } 6663 6664 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, 6665 esz, msz, fault, host_fn, tlb_fn); 6666 } 6667 6668 #define DO_LDFF1_LDNF1_1(PART, ESZ) \ 6669 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ 6670 target_ulong addr, uint32_t desc) \ 6671 { \ 6672 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ 6673 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6674 } \ 6675 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ 6676 target_ulong addr, uint32_t desc) \ 6677 { \ 6678 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ 6679 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6680 } \ 6681 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6682 target_ulong addr, uint32_t desc) \ 6683 { \ 6684 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ 6685 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6686 } \ 6687 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6688 target_ulong addr, uint32_t desc) \ 6689 { \ 6690 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ 6691 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6692 } 6693 6694 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ 6695 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ 6696 target_ulong addr, uint32_t desc) \ 6697 { \ 6698 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6699 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6700 } \ 6701 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ 6702 target_ulong addr, uint32_t desc) \ 6703 { \ 6704 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6705 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6706 } \ 6707 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ 6708 target_ulong addr, uint32_t desc) \ 6709 { \ 6710 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6711 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6712 } \ 6713 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ 6714 target_ulong addr, uint32_t desc) \ 6715 { \ 6716 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6717 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6718 } \ 6719 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6720 target_ulong addr, uint32_t desc) \ 6721 { \ 6722 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6723 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6724 } \ 6725 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6726 target_ulong addr, uint32_t desc) \ 6727 { \ 6728 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6729 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6730 } \ 6731 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6732 target_ulong addr, uint32_t desc) \ 6733 { \ 6734 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6735 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6736 } \ 6737 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6738 target_ulong addr, uint32_t desc) \ 6739 { \ 6740 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6741 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6742 } 6743 6744 DO_LDFF1_LDNF1_1(bb, MO_8) 6745 DO_LDFF1_LDNF1_1(bhu, 
MO_16) 6746 DO_LDFF1_LDNF1_1(bhs, MO_16) 6747 DO_LDFF1_LDNF1_1(bsu, MO_32) 6748 DO_LDFF1_LDNF1_1(bss, MO_32) 6749 DO_LDFF1_LDNF1_1(bdu, MO_64) 6750 DO_LDFF1_LDNF1_1(bds, MO_64) 6751 6752 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) 6753 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) 6754 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) 6755 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) 6756 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) 6757 6758 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) 6759 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) 6760 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) 6761 6762 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) 6763 6764 #undef DO_LDFF1_LDNF1_1 6765 #undef DO_LDFF1_LDNF1_2 6766 6767 /* 6768 * Common helper for all contiguous 1,2,3,4-register predicated stores. 6769 */ 6770 6771 static inline QEMU_ALWAYS_INLINE 6772 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, 6773 uint32_t desc, const uintptr_t retaddr, 6774 const int esz, const int msz, const int N, uint32_t mtedesc, 6775 sve_ldst1_host_fn *host_fn, 6776 sve_ldst1_tlb_fn *tlb_fn) 6777 { 6778 const unsigned rd = simd_data(desc); 6779 const intptr_t reg_max = simd_oprsz(desc); 6780 intptr_t reg_off, reg_last, mem_off; 6781 SVEContLdSt info; 6782 void *host; 6783 int i, flags; 6784 6785 /* Find the active elements. */ 6786 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6787 /* The entire predicate was false; no store occurs. */ 6788 return; 6789 } 6790 6791 /* Probe the page(s). Exit with exception for any invalid page. */ 6792 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); 6793 6794 /* Handle watchpoints for all active elements. */ 6795 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6796 BP_MEM_WRITE, retaddr); 6797 6798 /* 6799 * Handle mte checks for all active elements. 6800 * Since TBI must be set for MTE, !mtedesc => !mte_active. 6801 */ 6802 if (mtedesc) { 6803 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6804 mtedesc, retaddr); 6805 } 6806 6807 flags = info.page[0].flags | info.page[1].flags; 6808 if (unlikely(flags != 0)) { 6809 /* 6810 * At least one page includes MMIO. 6811 * Any bus operation can fail with cpu_transaction_failed, 6812 * which for ARM will raise SyncExternal. We cannot avoid 6813 * this fault and will leave with the store incomplete. 
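 *
 * (Unlike sve_ldN_r, which buffers into scratch registers, stores are
 * issued directly, so elements written before a failing access remain
 * in memory.)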
6814 */ 6815 mem_off = info.mem_off_first[0]; 6816 reg_off = info.reg_off_first[0]; 6817 reg_last = info.reg_off_last[1]; 6818 if (reg_last < 0) { 6819 reg_last = info.reg_off_split; 6820 if (reg_last < 0) { 6821 reg_last = info.reg_off_last[0]; 6822 } 6823 } 6824 6825 do { 6826 uint64_t pg = vg[reg_off >> 6]; 6827 do { 6828 if ((pg >> (reg_off & 63)) & 1) { 6829 for (i = 0; i < N; ++i) { 6830 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6831 addr + mem_off + (i << msz), retaddr); 6832 } 6833 } 6834 reg_off += 1 << esz; 6835 mem_off += N << msz; 6836 } while (reg_off & 63); 6837 } while (reg_off <= reg_last); 6838 return; 6839 } 6840 6841 mem_off = info.mem_off_first[0]; 6842 reg_off = info.reg_off_first[0]; 6843 reg_last = info.reg_off_last[0]; 6844 host = info.page[0].host; 6845 6846 set_helper_retaddr(retaddr); 6847 6848 while (reg_off <= reg_last) { 6849 uint64_t pg = vg[reg_off >> 6]; 6850 do { 6851 if ((pg >> (reg_off & 63)) & 1) { 6852 for (i = 0; i < N; ++i) { 6853 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6854 host + mem_off + (i << msz)); 6855 } 6856 } 6857 reg_off += 1 << esz; 6858 mem_off += N << msz; 6859 } while (reg_off <= reg_last && (reg_off & 63)); 6860 } 6861 6862 clear_helper_retaddr(); 6863 6864 /* 6865 * Use the slow path to manage the cross-page misalignment. 6866 * But we know this is RAM and cannot trap. 6867 */ 6868 mem_off = info.mem_off_split; 6869 if (unlikely(mem_off >= 0)) { 6870 reg_off = info.reg_off_split; 6871 for (i = 0; i < N; ++i) { 6872 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6873 addr + mem_off + (i << msz), retaddr); 6874 } 6875 } 6876 6877 mem_off = info.mem_off_first[1]; 6878 if (unlikely(mem_off >= 0)) { 6879 reg_off = info.reg_off_first[1]; 6880 reg_last = info.reg_off_last[1]; 6881 host = info.page[1].host; 6882 6883 set_helper_retaddr(retaddr); 6884 6885 do { 6886 uint64_t pg = vg[reg_off >> 6]; 6887 do { 6888 if ((pg >> (reg_off & 63)) & 1) { 6889 for (i = 0; i < N; ++i) { 6890 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6891 host + mem_off + (i << msz)); 6892 } 6893 } 6894 reg_off += 1 << esz; 6895 mem_off += N << msz; 6896 } while (reg_off & 63); 6897 } while (reg_off <= reg_last); 6898 6899 clear_helper_retaddr(); 6900 } 6901 } 6902 6903 static inline QEMU_ALWAYS_INLINE 6904 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6905 uint32_t desc, const uintptr_t ra, 6906 const int esz, const int msz, const int N, 6907 sve_ldst1_host_fn *host_fn, 6908 sve_ldst1_tlb_fn *tlb_fn) 6909 { 6910 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6911 int bit55 = extract64(addr, 55, 1); 6912 6913 /* Remove mtedesc from the normal sve descriptor. */ 6914 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6915 6916 /* Perform gross MTE suppression early. 
*/ 6917 if (!tbi_check(mtedesc, bit55) || 6918 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6919 mtedesc = 0; 6920 } 6921 6922 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6923 } 6924 6925 #define DO_STN_1(N, NAME, ESZ) \ 6926 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ 6927 target_ulong addr, uint32_t desc) \ 6928 { \ 6929 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ 6930 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6931 } \ 6932 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6933 target_ulong addr, uint32_t desc) \ 6934 { \ 6935 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ 6936 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6937 } 6938 6939 #define DO_STN_2(N, NAME, ESZ, MSZ) \ 6940 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ 6941 target_ulong addr, uint32_t desc) \ 6942 { \ 6943 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6944 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6945 } \ 6946 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ 6947 target_ulong addr, uint32_t desc) \ 6948 { \ 6949 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6950 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6951 } \ 6952 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6953 target_ulong addr, uint32_t desc) \ 6954 { \ 6955 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6956 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6957 } \ 6958 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6959 target_ulong addr, uint32_t desc) \ 6960 { \ 6961 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6962 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6963 } 6964 6965 DO_STN_1(1, bb, MO_8) 6966 DO_STN_1(1, bh, MO_16) 6967 DO_STN_1(1, bs, MO_32) 6968 DO_STN_1(1, bd, MO_64) 6969 DO_STN_1(2, bb, MO_8) 6970 DO_STN_1(3, bb, MO_8) 6971 DO_STN_1(4, bb, MO_8) 6972 6973 DO_STN_2(1, hh, MO_16, MO_16) 6974 DO_STN_2(1, hs, MO_32, MO_16) 6975 DO_STN_2(1, hd, MO_64, MO_16) 6976 DO_STN_2(2, hh, MO_16, MO_16) 6977 DO_STN_2(3, hh, MO_16, MO_16) 6978 DO_STN_2(4, hh, MO_16, MO_16) 6979 6980 DO_STN_2(1, ss, MO_32, MO_32) 6981 DO_STN_2(1, sd, MO_64, MO_32) 6982 DO_STN_2(2, ss, MO_32, MO_32) 6983 DO_STN_2(3, ss, MO_32, MO_32) 6984 DO_STN_2(4, ss, MO_32, MO_32) 6985 6986 DO_STN_2(1, dd, MO_64, MO_64) 6987 DO_STN_2(2, dd, MO_64, MO_64) 6988 DO_STN_2(3, dd, MO_64, MO_64) 6989 DO_STN_2(4, dd, MO_64, MO_64) 6990 6991 DO_STN_2(1, sq, MO_128, MO_32) 6992 DO_STN_2(1, dq, MO_128, MO_64) 6993 6994 DO_STN_2(2, qq, MO_128, MO_128) 6995 DO_STN_2(3, qq, MO_128, MO_128) 6996 DO_STN_2(4, qq, MO_128, MO_128) 6997 6998 #undef DO_STN_1 6999 #undef DO_STN_2 7000 7001 /* 7002 * Loads with a vector index. 7003 */ 7004 7005 /* 7006 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. 
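 *
 * The _s flavours index 32-bit offset elements and the _d flavours
 * 64-bit elements; 'zsu'/'zss' zero- or sign-extend a 32-bit offset,
 * while 'zd' uses the full 64-bit value.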
7007 */ 7008 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); 7009 7010 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) 7011 { 7012 return *(uint32_t *)(reg + H1_4(reg_ofs)); 7013 } 7014 7015 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) 7016 { 7017 return *(int32_t *)(reg + H1_4(reg_ofs)); 7018 } 7019 7020 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) 7021 { 7022 return (uint32_t)*(uint64_t *)(reg + reg_ofs); 7023 } 7024 7025 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) 7026 { 7027 return (int32_t)*(uint64_t *)(reg + reg_ofs); 7028 } 7029 7030 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) 7031 { 7032 return *(uint64_t *)(reg + reg_ofs); 7033 } 7034 7035 static inline QEMU_ALWAYS_INLINE 7036 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7037 target_ulong base, uint32_t desc, uintptr_t retaddr, 7038 uint32_t mtedesc, int esize, int msize, 7039 zreg_off_fn *off_fn, 7040 sve_ldst1_host_fn *host_fn, 7041 sve_ldst1_tlb_fn *tlb_fn) 7042 { 7043 const int mmu_idx = arm_env_mmu_index(env); 7044 const intptr_t reg_max = simd_oprsz(desc); 7045 const int scale = simd_data(desc); 7046 ARMVectorReg scratch; 7047 intptr_t reg_off; 7048 SVEHostPage info, info2; 7049 7050 memset(&scratch, 0, reg_max); 7051 reg_off = 0; 7052 do { 7053 uint64_t pg = vg[reg_off >> 6]; 7054 do { 7055 if (likely(pg & 1)) { 7056 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7057 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 7058 7059 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, 7060 mmu_idx, retaddr); 7061 7062 if (likely(in_page >= msize)) { 7063 if (unlikely(info.flags & TLB_WATCHPOINT)) { 7064 cpu_check_watchpoint(env_cpu(env), addr, msize, 7065 info.attrs, BP_MEM_READ, retaddr); 7066 } 7067 if (mtedesc && info.tagged) { 7068 mte_check(env, mtedesc, addr, retaddr); 7069 } 7070 if (unlikely(info.flags & TLB_MMIO)) { 7071 tlb_fn(env, &scratch, reg_off, addr, retaddr); 7072 } else { 7073 set_helper_retaddr(retaddr); 7074 host_fn(&scratch, reg_off, info.host); 7075 clear_helper_retaddr(); 7076 } 7077 } else { 7078 /* Element crosses the page boundary. */ 7079 sve_probe_page(&info2, false, env, addr + in_page, 0, 7080 MMU_DATA_LOAD, mmu_idx, retaddr); 7081 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { 7082 cpu_check_watchpoint(env_cpu(env), addr, 7083 msize, info.attrs, 7084 BP_MEM_READ, retaddr); 7085 } 7086 if (mtedesc && info.tagged) { 7087 mte_check(env, mtedesc, addr, retaddr); 7088 } 7089 tlb_fn(env, &scratch, reg_off, addr, retaddr); 7090 } 7091 } 7092 reg_off += esize; 7093 pg >>= esize; 7094 } while (reg_off & 63); 7095 } while (reg_off < reg_max); 7096 7097 /* Wait until all exceptions have been raised to write back. */ 7098 memcpy(vd, &scratch, reg_max); 7099 } 7100 7101 static inline QEMU_ALWAYS_INLINE 7102 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7103 target_ulong base, uint32_t desc, uintptr_t retaddr, 7104 int esize, int msize, zreg_off_fn *off_fn, 7105 sve_ldst1_host_fn *host_fn, 7106 sve_ldst1_tlb_fn *tlb_fn) 7107 { 7108 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7109 /* Remove mtedesc from the normal sve descriptor. */ 7110 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7111 7112 /* 7113 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7114 * offset base entirely over the address space hole to change the 7115 * pointer tag, or change the bit55 selector. 
So we could here 7116 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 7117 */ 7118 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7119 esize, msize, off_fn, host_fn, tlb_fn); 7120 } 7121 7122 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ 7123 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7124 void *vm, target_ulong base, uint32_t desc) \ 7125 { \ 7126 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 7127 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7128 } \ 7129 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7130 void *vm, target_ulong base, uint32_t desc) \ 7131 { \ 7132 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 7133 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7134 } 7135 7136 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ 7137 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7138 void *vm, target_ulong base, uint32_t desc) \ 7139 { \ 7140 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 7141 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7142 } \ 7143 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7144 void *vm, target_ulong base, uint32_t desc) \ 7145 { \ 7146 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 7147 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7148 } 7149 7150 DO_LD1_ZPZ_S(bsu, zsu, MO_8) 7151 DO_LD1_ZPZ_S(bsu, zss, MO_8) 7152 DO_LD1_ZPZ_D(bdu, zsu, MO_8) 7153 DO_LD1_ZPZ_D(bdu, zss, MO_8) 7154 DO_LD1_ZPZ_D(bdu, zd, MO_8) 7155 7156 DO_LD1_ZPZ_S(bss, zsu, MO_8) 7157 DO_LD1_ZPZ_S(bss, zss, MO_8) 7158 DO_LD1_ZPZ_D(bds, zsu, MO_8) 7159 DO_LD1_ZPZ_D(bds, zss, MO_8) 7160 DO_LD1_ZPZ_D(bds, zd, MO_8) 7161 7162 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) 7163 DO_LD1_ZPZ_S(hsu_le, zss, MO_16) 7164 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) 7165 DO_LD1_ZPZ_D(hdu_le, zss, MO_16) 7166 DO_LD1_ZPZ_D(hdu_le, zd, MO_16) 7167 7168 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) 7169 DO_LD1_ZPZ_S(hsu_be, zss, MO_16) 7170 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) 7171 DO_LD1_ZPZ_D(hdu_be, zss, MO_16) 7172 DO_LD1_ZPZ_D(hdu_be, zd, MO_16) 7173 7174 DO_LD1_ZPZ_S(hss_le, zsu, MO_16) 7175 DO_LD1_ZPZ_S(hss_le, zss, MO_16) 7176 DO_LD1_ZPZ_D(hds_le, zsu, MO_16) 7177 DO_LD1_ZPZ_D(hds_le, zss, MO_16) 7178 DO_LD1_ZPZ_D(hds_le, zd, MO_16) 7179 7180 DO_LD1_ZPZ_S(hss_be, zsu, MO_16) 7181 DO_LD1_ZPZ_S(hss_be, zss, MO_16) 7182 DO_LD1_ZPZ_D(hds_be, zsu, MO_16) 7183 DO_LD1_ZPZ_D(hds_be, zss, MO_16) 7184 DO_LD1_ZPZ_D(hds_be, zd, MO_16) 7185 7186 DO_LD1_ZPZ_S(ss_le, zsu, MO_32) 7187 DO_LD1_ZPZ_S(ss_le, zss, MO_32) 7188 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) 7189 DO_LD1_ZPZ_D(sdu_le, zss, MO_32) 7190 DO_LD1_ZPZ_D(sdu_le, zd, MO_32) 7191 7192 DO_LD1_ZPZ_S(ss_be, zsu, MO_32) 7193 DO_LD1_ZPZ_S(ss_be, zss, MO_32) 7194 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) 7195 DO_LD1_ZPZ_D(sdu_be, zss, MO_32) 7196 DO_LD1_ZPZ_D(sdu_be, zd, MO_32) 7197 7198 DO_LD1_ZPZ_D(sds_le, zsu, MO_32) 7199 DO_LD1_ZPZ_D(sds_le, zss, MO_32) 7200 DO_LD1_ZPZ_D(sds_le, zd, MO_32) 7201 7202 DO_LD1_ZPZ_D(sds_be, zsu, MO_32) 7203 DO_LD1_ZPZ_D(sds_be, zss, MO_32) 7204 DO_LD1_ZPZ_D(sds_be, zd, MO_32) 7205 7206 DO_LD1_ZPZ_D(dd_le, zsu, MO_64) 7207 DO_LD1_ZPZ_D(dd_le, zss, MO_64) 7208 DO_LD1_ZPZ_D(dd_le, zd, MO_64) 7209 7210 DO_LD1_ZPZ_D(dd_be, zsu, MO_64) 7211 DO_LD1_ZPZ_D(dd_be, zss, MO_64) 7212 DO_LD1_ZPZ_D(dd_be, zd, MO_64) 7213 7214 DO_LD1_ZPZ_D(qq_le, zd, MO_128) 7215 DO_LD1_ZPZ_D(qq_be, zd, MO_128) 7216 7217 #undef DO_LD1_ZPZ_S 7218 #undef DO_LD1_ZPZ_D 7219 7220 /* 
First fault loads with a vector index. */ 7221 7222 /* 7223 * Common helpers for all gather first-faulting loads. 7224 */ 7225 7226 static inline QEMU_ALWAYS_INLINE 7227 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7228 target_ulong base, uint32_t desc, uintptr_t retaddr, 7229 uint32_t mtedesc, const int esz, const int msz, 7230 zreg_off_fn *off_fn, 7231 sve_ldst1_host_fn *host_fn, 7232 sve_ldst1_tlb_fn *tlb_fn) 7233 { 7234 const int mmu_idx = arm_env_mmu_index(env); 7235 const intptr_t reg_max = simd_oprsz(desc); 7236 const int scale = simd_data(desc); 7237 const int esize = 1 << esz; 7238 const int msize = 1 << msz; 7239 intptr_t reg_off; 7240 SVEHostPage info; 7241 target_ulong addr, in_page; 7242 ARMVectorReg scratch; 7243 7244 /* Skip to the first true predicate. */ 7245 reg_off = find_next_active(vg, 0, reg_max, esz); 7246 if (unlikely(reg_off >= reg_max)) { 7247 /* The entire predicate was false; no load occurs. */ 7248 memset(vd, 0, reg_max); 7249 return; 7250 } 7251 7252 /* Protect against overlap between vd and vm. */ 7253 if (unlikely(vd == vm)) { 7254 vm = memcpy(&scratch, vm, reg_max); 7255 } 7256 7257 /* 7258 * Probe the first element, allowing faults. 7259 */ 7260 addr = base + (off_fn(vm, reg_off) << scale); 7261 if (mtedesc) { 7262 mte_check(env, mtedesc, addr, retaddr); 7263 } 7264 tlb_fn(env, vd, reg_off, addr, retaddr); 7265 7266 /* After any fault, zero the other elements. */ 7267 swap_memzero(vd, reg_off); 7268 reg_off += esize; 7269 swap_memzero(vd + reg_off, reg_max - reg_off); 7270 7271 /* 7272 * Probe the remaining elements, not allowing faults. 7273 */ 7274 while (reg_off < reg_max) { 7275 uint64_t pg = vg[reg_off >> 6]; 7276 do { 7277 if (likely((pg >> (reg_off & 63)) & 1)) { 7278 addr = base + (off_fn(vm, reg_off) << scale); 7279 in_page = -(addr | TARGET_PAGE_MASK); 7280 7281 if (unlikely(in_page < msize)) { 7282 /* Stop if the element crosses a page boundary. */ 7283 goto fault; 7284 } 7285 7286 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, 7287 mmu_idx, retaddr); 7288 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { 7289 goto fault; 7290 } 7291 if (unlikely(info.flags & TLB_WATCHPOINT) && 7292 (cpu_watchpoint_address_matches 7293 (env_cpu(env), addr, msize) & BP_MEM_READ)) { 7294 goto fault; 7295 } 7296 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { 7297 goto fault; 7298 } 7299 7300 set_helper_retaddr(retaddr); 7301 host_fn(vd, reg_off, info.host); 7302 clear_helper_retaddr(); 7303 } 7304 reg_off += esize; 7305 } while (reg_off & 63); 7306 } 7307 return; 7308 7309 fault: 7310 record_fault(env, reg_off, reg_max); 7311 } 7312 7313 static inline QEMU_ALWAYS_INLINE 7314 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7315 target_ulong base, uint32_t desc, uintptr_t retaddr, 7316 const int esz, const int msz, 7317 zreg_off_fn *off_fn, 7318 sve_ldst1_host_fn *host_fn, 7319 sve_ldst1_tlb_fn *tlb_fn) 7320 { 7321 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7322 /* Remove mtedesc from the normal sve descriptor. */ 7323 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7324 7325 /* 7326 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7327 * offset base entirely over the address space hole to change the 7328 * pointer tag, or change the bit55 selector. So we could here 7329 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
7330 */ 7331 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7332 esz, msz, off_fn, host_fn, tlb_fn); 7333 } 7334 7335 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ 7336 void HELPER(sve_ldff##MEM##_##OFS) \ 7337 (CPUARMState *env, void *vd, void *vg, \ 7338 void *vm, target_ulong base, uint32_t desc) \ 7339 { \ 7340 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ 7341 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7342 } \ 7343 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 7344 (CPUARMState *env, void *vd, void *vg, \ 7345 void *vm, target_ulong base, uint32_t desc) \ 7346 { \ 7347 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ 7348 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7349 } 7350 7351 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ 7352 void HELPER(sve_ldff##MEM##_##OFS) \ 7353 (CPUARMState *env, void *vd, void *vg, \ 7354 void *vm, target_ulong base, uint32_t desc) \ 7355 { \ 7356 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ 7357 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7358 } \ 7359 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 7360 (CPUARMState *env, void *vd, void *vg, \ 7361 void *vm, target_ulong base, uint32_t desc) \ 7362 { \ 7363 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ 7364 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7365 } 7366 7367 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) 7368 DO_LDFF1_ZPZ_S(bsu, zss, MO_8) 7369 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) 7370 DO_LDFF1_ZPZ_D(bdu, zss, MO_8) 7371 DO_LDFF1_ZPZ_D(bdu, zd, MO_8) 7372 7373 DO_LDFF1_ZPZ_S(bss, zsu, MO_8) 7374 DO_LDFF1_ZPZ_S(bss, zss, MO_8) 7375 DO_LDFF1_ZPZ_D(bds, zsu, MO_8) 7376 DO_LDFF1_ZPZ_D(bds, zss, MO_8) 7377 DO_LDFF1_ZPZ_D(bds, zd, MO_8) 7378 7379 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) 7380 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) 7381 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) 7382 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) 7383 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) 7384 7385 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) 7386 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) 7387 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) 7388 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) 7389 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) 7390 7391 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16) 7392 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) 7393 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) 7394 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) 7395 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) 7396 7397 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) 7398 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) 7399 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) 7400 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) 7401 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) 7402 7403 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) 7404 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) 7405 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) 7406 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) 7407 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) 7408 7409 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) 7410 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) 7411 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) 7412 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) 7413 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) 7414 7415 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) 7416 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) 7417 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) 7418 7419 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) 7420 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) 7421 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) 7422 7423 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) 7424 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) 7425 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) 7426 7427 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) 7428 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) 7429 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) 7430 7431 /* Stores with a vector index. 
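 * All faults other than SyncExternal from MMIO are recognized before any element is written, so a trapping scatter store leaves memory unmodified.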
*/ 7432 7433 static inline QEMU_ALWAYS_INLINE 7434 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7435 target_ulong base, uint32_t desc, uintptr_t retaddr, 7436 uint32_t mtedesc, int esize, int msize, 7437 zreg_off_fn *off_fn, 7438 sve_ldst1_host_fn *host_fn, 7439 sve_ldst1_tlb_fn *tlb_fn) 7440 { 7441 const int mmu_idx = arm_env_mmu_index(env); 7442 const intptr_t reg_max = simd_oprsz(desc); 7443 const int scale = simd_data(desc); 7444 void *host[ARM_MAX_VQ * 4]; 7445 intptr_t reg_off, i; 7446 SVEHostPage info, info2; 7447 7448 /* 7449 * Probe all of the elements for host addresses and flags. 7450 */ 7451 i = reg_off = 0; 7452 do { 7453 uint64_t pg = vg[reg_off >> 6]; 7454 do { 7455 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7456 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 7457 7458 host[i] = NULL; 7459 if (likely((pg >> (reg_off & 63)) & 1)) { 7460 if (likely(in_page >= msize)) { 7461 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE, 7462 mmu_idx, retaddr); 7463 if (!(info.flags & TLB_MMIO)) { 7464 host[i] = info.host; 7465 } 7466 } else { 7467 /* 7468 * Element crosses the page boundary. 7469 * Probe both pages, but do not record the host address, 7470 * so that we use the slow path. 7471 */ 7472 sve_probe_page(&info, false, env, addr, 0, 7473 MMU_DATA_STORE, mmu_idx, retaddr); 7474 sve_probe_page(&info2, false, env, addr + in_page, 0, 7475 MMU_DATA_STORE, mmu_idx, retaddr); 7476 info.flags |= info2.flags; 7477 } 7478 7479 if (unlikely(info.flags & TLB_WATCHPOINT)) { 7480 cpu_check_watchpoint(env_cpu(env), addr, msize, 7481 info.attrs, BP_MEM_WRITE, retaddr); 7482 } 7483 7484 if (mtedesc && info.tagged) { 7485 mte_check(env, mtedesc, addr, retaddr); 7486 } 7487 } 7488 i += 1; 7489 reg_off += esize; 7490 } while (reg_off & 63); 7491 } while (reg_off < reg_max); 7492 7493 /* 7494 * Now that we have recognized all exceptions except SyncExternal 7495 * (from TLB_MMIO), which we cannot avoid, perform all of the stores. 7496 * 7497 * Note for the common case of an element in RAM, not crossing a page 7498 * boundary, we have stored the host address in host[]. This doubles 7499 * as a first-level check against the predicate, since only enabled 7500 * elements have non-null host addresses. 7501 */ 7502 i = reg_off = 0; 7503 do { 7504 void *h = host[i]; 7505 if (likely(h != NULL)) { 7506 set_helper_retaddr(retaddr); 7507 host_fn(vd, reg_off, h); 7508 clear_helper_retaddr(); 7509 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) { 7510 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7511 tlb_fn(env, vd, reg_off, addr, retaddr); 7512 } 7513 i += 1; 7514 reg_off += esize; 7515 } while (reg_off < reg_max); 7516 } 7517 7518 static inline QEMU_ALWAYS_INLINE 7519 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7520 target_ulong base, uint32_t desc, uintptr_t retaddr, 7521 int esize, int msize, zreg_off_fn *off_fn, 7522 sve_ldst1_host_fn *host_fn, 7523 sve_ldst1_tlb_fn *tlb_fn) 7524 { 7525 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7526 /* Remove mtedesc from the normal sve descriptor. */ 7527 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7528 7529 /* 7530 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7531 * offset base entirely over the address space hole to change the 7532 * pointer tag, or change the bit55 selector. So we could here 7533 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
7534 */ 7535 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7536 esize, msize, off_fn, host_fn, tlb_fn); 7537 } 7538 7539 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ 7540 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7541 void *vm, target_ulong base, uint32_t desc) \ 7542 { \ 7543 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 7544 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7545 } \ 7546 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7547 void *vm, target_ulong base, uint32_t desc) \ 7548 { \ 7549 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 7550 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7551 } 7552 7553 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ 7554 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7555 void *vm, target_ulong base, uint32_t desc) \ 7556 { \ 7557 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 7558 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7559 } \ 7560 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7561 void *vm, target_ulong base, uint32_t desc) \ 7562 { \ 7563 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 7564 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7565 } 7566 7567 DO_ST1_ZPZ_S(bs, zsu, MO_8) 7568 DO_ST1_ZPZ_S(hs_le, zsu, MO_16) 7569 DO_ST1_ZPZ_S(hs_be, zsu, MO_16) 7570 DO_ST1_ZPZ_S(ss_le, zsu, MO_32) 7571 DO_ST1_ZPZ_S(ss_be, zsu, MO_32) 7572 7573 DO_ST1_ZPZ_S(bs, zss, MO_8) 7574 DO_ST1_ZPZ_S(hs_le, zss, MO_16) 7575 DO_ST1_ZPZ_S(hs_be, zss, MO_16) 7576 DO_ST1_ZPZ_S(ss_le, zss, MO_32) 7577 DO_ST1_ZPZ_S(ss_be, zss, MO_32) 7578 7579 DO_ST1_ZPZ_D(bd, zsu, MO_8) 7580 DO_ST1_ZPZ_D(hd_le, zsu, MO_16) 7581 DO_ST1_ZPZ_D(hd_be, zsu, MO_16) 7582 DO_ST1_ZPZ_D(sd_le, zsu, MO_32) 7583 DO_ST1_ZPZ_D(sd_be, zsu, MO_32) 7584 DO_ST1_ZPZ_D(dd_le, zsu, MO_64) 7585 DO_ST1_ZPZ_D(dd_be, zsu, MO_64) 7586 7587 DO_ST1_ZPZ_D(bd, zss, MO_8) 7588 DO_ST1_ZPZ_D(hd_le, zss, MO_16) 7589 DO_ST1_ZPZ_D(hd_be, zss, MO_16) 7590 DO_ST1_ZPZ_D(sd_le, zss, MO_32) 7591 DO_ST1_ZPZ_D(sd_be, zss, MO_32) 7592 DO_ST1_ZPZ_D(dd_le, zss, MO_64) 7593 DO_ST1_ZPZ_D(dd_be, zss, MO_64) 7594 7595 DO_ST1_ZPZ_D(bd, zd, MO_8) 7596 DO_ST1_ZPZ_D(hd_le, zd, MO_16) 7597 DO_ST1_ZPZ_D(hd_be, zd, MO_16) 7598 DO_ST1_ZPZ_D(sd_le, zd, MO_32) 7599 DO_ST1_ZPZ_D(sd_be, zd, MO_32) 7600 DO_ST1_ZPZ_D(dd_le, zd, MO_64) 7601 DO_ST1_ZPZ_D(dd_be, zd, MO_64) 7602 7603 DO_ST1_ZPZ_D(qq_le, zd, MO_128) 7604 DO_ST1_ZPZ_D(qq_be, zd, MO_128) 7605 7606 #undef DO_ST1_ZPZ_S 7607 #undef DO_ST1_ZPZ_D 7608 7609 /* 7610 * SVE2.1 consecutive register load/store 7611 */ 7612 7613 static unsigned sve2p1_cont_ldst_elements(SVEContLdSt *info, vaddr addr, 7614 uint32_t png, intptr_t reg_max, 7615 int N, int v_esz) 7616 { 7617 const int esize = 1 << v_esz; 7618 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 7619 DecodeCounter p = decode_counter(png, reg_max, v_esz); 7620 unsigned b_count = p.count << v_esz; 7621 unsigned b_stride = 1 << (v_esz + p.lg2_stride); 7622 intptr_t page_split; 7623 7624 /* Set all of the element indices to -1, and the TLB data to 0. 
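 * (A byte fill of -1 leaves every signed offset field equal to -1, which the code below treats as "not present".)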
*/ 7625 memset(info, -1, offsetof(SVEContLdSt, page)); 7626 memset(info->page, 0, sizeof(info->page)); 7627 7628 if (p.invert) { 7629 if (b_count >= reg_max * N) { 7630 return 0; 7631 } 7632 reg_off_first = b_count; 7633 reg_off_last = reg_max * N - b_stride; 7634 } else { 7635 if (b_count == 0) { 7636 return 0; 7637 } 7638 reg_off_first = 0; 7639 reg_off_last = MIN(b_count - esize, reg_max * N - b_stride); 7640 } 7641 7642 info->reg_off_first[0] = reg_off_first; 7643 info->mem_off_first[0] = reg_off_first; 7644 7645 page_split = -(addr | TARGET_PAGE_MASK); 7646 if (reg_off_last + esize <= page_split || reg_off_first >= page_split) { 7647 /* The entire operation fits within a single page. */ 7648 info->reg_off_last[0] = reg_off_last; 7649 return b_stride; 7650 } 7651 7652 info->page_split = page_split; 7653 reg_off_split = ROUND_DOWN(page_split, esize); 7654 7655 /* 7656 * This is the last full element on the first page, but it is not 7657 * necessarily active. If there is no full element, i.e. the first 7658 * active element is the one that's split, this value remains -1. 7659 * It is useful as iteration bounds. 7660 */ 7661 if (reg_off_split != 0) { 7662 info->reg_off_last[0] = ROUND_DOWN(reg_off_split - esize, b_stride); 7663 } 7664 7665 /* Determine if an unaligned element spans the pages. */ 7666 if (page_split & (esize - 1)) { 7667 /* It is helpful to know if the split element is active. */ 7668 if ((reg_off_split & (b_stride - 1)) == 0) { 7669 info->reg_off_split = reg_off_split; 7670 info->mem_off_split = reg_off_split; 7671 } 7672 reg_off_split += esize; 7673 } 7674 7675 /* 7676 * We do want the first active element on the second page, because 7677 * this may affect the address reported in an exception. 7678 */ 7679 reg_off_split = ROUND_UP(reg_off_split, b_stride); 7680 if (reg_off_split <= reg_off_last) { 7681 info->reg_off_first[1] = reg_off_split; 7682 info->mem_off_first[1] = reg_off_split; 7683 info->reg_off_last[1] = reg_off_last; 7684 } 7685 return b_stride; 7686 } 7687 7688 static void sve2p1_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, 7689 target_ulong addr, unsigned estride, 7690 int esize, int wp_access, uintptr_t ra) 7691 { 7692 #ifndef CONFIG_USER_ONLY 7693 intptr_t count_off, count_last; 7694 int flags0 = info->page[0].flags; 7695 int flags1 = info->page[1].flags; 7696 7697 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { 7698 return; 7699 } 7700 7701 /* Indicate that watchpoints are handled. 
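 * Clearing TLB_WATCHPOINT from the cached flags keeps the callers from routing these pages through the slow MMIO path just for a watchpoint; the checks are instead performed per active element below.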
*/ 7702 info->page[0].flags = flags0 & ~TLB_WATCHPOINT; 7703 info->page[1].flags = flags1 & ~TLB_WATCHPOINT; 7704 7705 if (flags0 & TLB_WATCHPOINT) { 7706 count_off = info->reg_off_first[0]; 7707 count_last = info->reg_off_split; 7708 if (count_last < 0) { 7709 count_last = info->reg_off_last[0]; 7710 } 7711 do { 7712 cpu_check_watchpoint(env_cpu(env), addr + count_off, 7713 esize, info->page[0].attrs, wp_access, ra); 7714 count_off += estride; 7715 } while (count_off <= count_last); 7716 } 7717 7718 count_off = info->reg_off_first[1]; 7719 if ((flags1 & TLB_WATCHPOINT) && count_off >= 0) { 7720 count_last = info->reg_off_last[1]; 7721 do { 7722 cpu_check_watchpoint(env_cpu(env), addr + count_off, 7723 esize, info->page[1].attrs, 7724 wp_access, ra); 7725 count_off += estride; 7726 } while (count_off <= count_last); 7727 } 7728 #endif 7729 } 7730 7731 static void sve2p1_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, 7732 target_ulong addr, unsigned estride, 7733 int esize, uint32_t mtedesc, 7734 uintptr_t ra) 7735 { 7736 intptr_t count_off, count_last; 7737 7738 /* 7739 * TODO: estride is always a small power of two, <= 8. 7740 * Manipulate the stride within the loops such that 7741 * - first iteration hits addr + off, as required, 7742 * - second iteration hits ALIGN_UP(addr, 16), 7743 * - other iterations advance addr by 16. 7744 * This will minimize the probing to once per MTE granule. 7745 */ 7746 7747 /* Process the page only if MemAttr == Tagged. */ 7748 if (info->page[0].tagged) { 7749 count_off = info->reg_off_first[0]; 7750 count_last = info->reg_off_split; 7751 if (count_last < 0) { 7752 count_last = info->reg_off_last[0]; 7753 } 7754 7755 do { 7756 mte_check(env, mtedesc, addr + count_off, ra); 7757 count_off += estride; 7758 } while (count_off <= count_last); 7759 } 7760 7761 count_off = info->reg_off_first[1]; 7762 if (count_off >= 0 && info->page[1].tagged) { 7763 count_last = info->reg_off_last[1]; 7764 do { 7765 mte_check(env, mtedesc, addr + count_off, ra); 7766 count_off += estride; 7767 } while (count_off <= count_last); 7768 } 7769 } 7770 7771 static inline QEMU_ALWAYS_INLINE 7772 void sve2p1_ld1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr, 7773 uint32_t png, uint32_t desc, 7774 const uintptr_t ra, const MemOp esz, 7775 sve_ldst1_host_fn *host_fn, 7776 sve_ldst1_tlb_fn *tlb_fn) 7777 { 7778 const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2; 7779 const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4); 7780 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7781 const intptr_t reg_max = simd_oprsz(desc); 7782 const unsigned esize = 1 << esz; 7783 intptr_t count_off, count_last; 7784 intptr_t reg_off, reg_last, reg_n; 7785 SVEContLdSt info; 7786 unsigned estride, flags; 7787 void *host; 7788 7789 estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz); 7790 if (estride == 0) { 7791 /* The entire predicate was false; no load occurs. */ 7792 for (unsigned n = 0; n < N; n++) { 7793 memset(zd + n * rstride, 0, reg_max); 7794 } 7795 return; 7796 } 7797 7798 /* Probe the page(s). Exit with exception for any invalid page. */ 7799 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra); 7800 7801 /* Handle watchpoints for all active elements. */ 7802 sve2p1_cont_ldst_watchpoints(&info, env, addr, estride, 7803 esize, BP_MEM_READ, ra); 7804 7805 /* 7806 * Handle mte checks for all active elements. 7807 * Since TBI must be set for MTE, !mtedesc => !mte_active. 
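 * (mtedesc is zero whenever no MTE check is required, so the whole pass below is skipped in that case.)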
7808 */ 7809 if (mtedesc) { 7810 sve2p1_cont_ldst_mte_check(&info, env, addr, estride, 7811 esize, mtedesc, ra); 7812 } 7813 7814 flags = info.page[0].flags | info.page[1].flags; 7815 if (unlikely(flags != 0)) { 7816 /* 7817 * At least one page includes MMIO. 7818 * Any bus operation can fail with cpu_transaction_failed, 7819 * which for ARM will raise SyncExternal. Perform the load 7820 * into scratch memory to preserve register state until the end. 7821 */ 7822 ARMVectorReg scratch[4] = { }; 7823 7824 count_off = info.reg_off_first[0]; 7825 count_last = info.reg_off_last[1]; 7826 if (count_last < 0) { 7827 count_last = info.reg_off_split; 7828 if (count_last < 0) { 7829 count_last = info.reg_off_last[0]; 7830 } 7831 } 7832 reg_off = count_off % reg_max; 7833 reg_n = count_off / reg_max; 7834 7835 do { 7836 reg_last = MIN(count_last - count_off, reg_max - esize); 7837 do { 7838 tlb_fn(env, &scratch[reg_n], reg_off, addr + count_off, ra); 7839 reg_off += estride; 7840 count_off += estride; 7841 } while (reg_off <= reg_last); 7842 reg_off = 0; 7843 reg_n++; 7844 } while (count_off <= count_last); 7845 7846 for (unsigned n = 0; n < N; ++n) { 7847 memcpy(&zd[n * rstride], &scratch[n], reg_max); 7848 } 7849 return; 7850 } 7851 7852 /* The entire operation is in RAM, on valid pages. */ 7853 7854 for (unsigned n = 0; n < N; ++n) { 7855 memset(&zd[n * rstride], 0, reg_max); 7856 } 7857 7858 count_off = info.reg_off_first[0]; 7859 count_last = info.reg_off_last[0]; 7860 reg_off = count_off % reg_max; 7861 reg_n = count_off / reg_max; 7862 host = info.page[0].host; 7863 7864 set_helper_retaddr(ra); 7865 7866 do { 7867 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); 7868 do { 7869 host_fn(&zd[reg_n * rstride], reg_off, host + count_off); 7870 reg_off += estride; 7871 count_off += estride; 7872 } while (reg_off <= reg_last); 7873 reg_off = 0; 7874 reg_n++; 7875 } while (count_off <= count_last); 7876 7877 clear_helper_retaddr(); 7878 7879 /* 7880 * Use the slow path to manage the cross-page misalignment. 7881 * But we know this is RAM and cannot trap.
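 * (Both pages were probed above with FAULT_ALL and neither is MMIO on this path, so the single tlb_fn call below only splits the element across the page boundary.)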
7882 */ 7883 count_off = info.reg_off_split; 7884 if (unlikely(count_off >= 0)) { 7885 reg_off = count_off % reg_max; 7886 reg_n = count_off / reg_max; 7887 tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra); 7888 } 7889 7890 count_off = info.reg_off_first[1]; 7891 if (unlikely(count_off >= 0)) { 7892 count_last = info.reg_off_last[1]; 7893 reg_off = count_off % reg_max; 7894 reg_n = count_off / reg_max; 7895 host = info.page[1].host; 7896 7897 set_helper_retaddr(ra); 7898 7899 do { 7900 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); 7901 do { 7902 host_fn(&zd[reg_n * rstride], reg_off, host + count_off); 7903 reg_off += estride; 7904 count_off += estride; 7905 } while (reg_off <= reg_last); 7906 reg_off = 0; 7907 reg_n++; 7908 } while (count_off <= count_last); 7909 7910 clear_helper_retaddr(); 7911 } 7912 } 7913 7914 void HELPER(sve2p1_ld1bb_c)(CPUARMState *env, void *vd, target_ulong addr, 7915 uint32_t png, uint32_t desc) 7916 { 7917 sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), MO_8, 7918 sve_ld1bb_host, sve_ld1bb_tlb); 7919 } 7920 7921 #define DO_LD1_2(NAME, ESZ) \ 7922 void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd, \ 7923 target_ulong addr, uint32_t png, \ 7924 uint32_t desc) \ 7925 { \ 7926 sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ 7927 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 7928 } \ 7929 void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd, \ 7930 target_ulong addr, uint32_t png, \ 7931 uint32_t desc) \ 7932 { \ 7933 sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ 7934 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 7935 } 7936 7937 DO_LD1_2(ld1hh, MO_16) 7938 DO_LD1_2(ld1ss, MO_32) 7939 DO_LD1_2(ld1dd, MO_64) 7940 7941 #undef DO_LD1_2 7942 7943 static inline QEMU_ALWAYS_INLINE 7944 void sve2p1_st1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr, 7945 uint32_t png, uint32_t desc, 7946 const uintptr_t ra, const int esz, 7947 sve_ldst1_host_fn *host_fn, 7948 sve_ldst1_tlb_fn *tlb_fn) 7949 { 7950 const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2; 7951 const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4); 7952 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7953 const intptr_t reg_max = simd_oprsz(desc); 7954 const unsigned esize = 1 << esz; 7955 intptr_t count_off, count_last; 7956 intptr_t reg_off, reg_last, reg_n; 7957 SVEContLdSt info; 7958 unsigned estride, flags; 7959 void *host; 7960 7961 estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz); 7962 if (estride == 0) { 7963 /* The entire predicate was false; no store occurs. */ 7964 return; 7965 } 7966 7967 /* Probe the page(s). Exit with exception for any invalid page. */ 7968 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra); 7969 7970 /* Handle watchpoints for all active elements. */ 7971 sve2p1_cont_ldst_watchpoints(&info, env, addr, estride, 7972 esize, BP_MEM_WRITE, ra); 7973 7974 /* 7975 * Handle mte checks for all active elements. 7976 * Since TBI must be set for MTE, !mtedesc => !mte_active. 7977 */ 7978 if (mtedesc) { 7979 sve2p1_cont_ldst_mte_check(&info, env, addr, estride, 7980 esize, mtedesc, ra); 7981 } 7982 7983 flags = info.page[0].flags | info.page[1].flags; 7984 if (unlikely(flags != 0)) { 7985 /* 7986 * At least one page includes MMIO. 7987 * Any bus operation can fail with cpu_transaction_failed, 7988 * which for ARM will raise SyncExternal. Unlike the load above there 7989 * is no register state to preserve, so issue every store through the slow tlb path.
7990 */ 7991 count_off = info.reg_off_first[0]; 7992 count_last = info.reg_off_last[1]; 7993 if (count_last < 0) { 7994 count_last = info.reg_off_split; 7995 if (count_last < 0) { 7996 count_last = info.reg_off_last[0]; 7997 } 7998 } 7999 reg_off = count_off % reg_max; 8000 reg_n = count_off / reg_max; 8001 8002 do { 8003 reg_last = MIN(count_last - count_off, reg_max - esize); 8004 do { 8005 tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra); 8006 reg_off += estride; 8007 count_off += estride; 8008 } while (reg_off <= reg_last); 8009 reg_off = 0; 8010 reg_n++; 8011 } while (count_off <= count_last); 8012 return; 8013 } 8014 8015 /* The entire operation is in RAM, on valid pages. */ 8016 8017 count_off = info.reg_off_first[0]; 8018 count_last = info.reg_off_last[0]; 8019 reg_off = count_off % reg_max; 8020 reg_n = count_off / reg_max; 8021 host = info.page[0].host; 8022 8023 set_helper_retaddr(ra); 8024 8025 do { 8026 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); 8027 do { 8028 host_fn(&zd[reg_n * rstride], reg_off, host + count_off); 8029 reg_off += estride; 8030 count_off += estride; 8031 } while (reg_off <= reg_last); 8032 reg_off = 0; 8033 reg_n++; 8034 } while (count_off <= count_last); 8035 8036 clear_helper_retaddr(); 8037 8038 /* 8039 * Use the slow path to manage the cross-page misalignment. 8040 * But we know this is RAM and cannot trap. 8041 */ 8042 count_off = info.reg_off_split; 8043 if (unlikely(count_off >= 0)) { 8044 reg_off = count_off % reg_max; 8045 reg_n = count_off / reg_max; 8046 tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra); 8047 } 8048 8049 count_off = info.reg_off_first[1]; 8050 if (unlikely(count_off >= 0)) { 8051 count_last = info.reg_off_last[1]; 8052 reg_off = count_off % reg_max; 8053 reg_n = count_off / reg_max; 8054 host = info.page[1].host; 8055 8056 set_helper_retaddr(ra); 8057 8058 do { 8059 reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize); 8060 do { 8061 host_fn(&zd[reg_n * rstride], reg_off, host + count_off); 8062 reg_off += estride; 8063 count_off += estride; 8064 } while (reg_off <= reg_last); 8065 reg_off = 0; 8066 reg_n++; 8067 } while (count_off <= count_last); 8068 8069 clear_helper_retaddr(); 8070 } 8071 } 8072 8073 void HELPER(sve2p1_st1bb_c)(CPUARMState *env, void *vd, target_ulong addr, 8074 uint32_t png, uint32_t desc) 8075 { 8076 sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), MO_8, 8077 sve_st1bb_host, sve_st1bb_tlb); 8078 } 8079 8080 #define DO_ST1_2(NAME, ESZ) \ 8081 void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd, \ 8082 target_ulong addr, uint32_t png, \ 8083 uint32_t desc) \ 8084 { \ 8085 sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ 8086 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 8087 } \ 8088 void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd, \ 8089 target_ulong addr, uint32_t png, \ 8090 uint32_t desc) \ 8091 { \ 8092 sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ, \ 8093 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 8094 } 8095 8096 DO_ST1_2(st1hh, MO_16) 8097 DO_ST1_2(st1ss, MO_32) 8098 DO_ST1_2(st1dd, MO_64) 8099 8100 #undef DO_ST1_2 8101 8102 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 8103 { 8104 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8105 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 8106 8107 for (i = 0; i < opr_sz; ++i) { 8108 d[i] = n[i] ^ m[i] ^ k[i]; 8109 } 8110 } 8111 8112 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 8113 { 8114 intptr_t i, opr_sz = 
simd_oprsz(desc) / 8; 8115 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 8116 8117 for (i = 0; i < opr_sz; ++i) { 8118 d[i] = n[i] ^ (m[i] & ~k[i]); 8119 } 8120 } 8121 8122 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 8123 { 8124 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8125 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 8126 8127 for (i = 0; i < opr_sz; ++i) { 8128 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]); 8129 } 8130 } 8131 8132 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 8133 { 8134 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8135 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 8136 8137 for (i = 0; i < opr_sz; ++i) { 8138 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]); 8139 } 8140 } 8141 8142 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 8143 { 8144 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8145 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 8146 8147 for (i = 0; i < opr_sz; ++i) { 8148 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i])); 8149 } 8150 } 8151 8152 /* 8153 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n. 8154 * See hasless(v,1) from 8155 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord 8156 */ 8157 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz) 8158 { 8159 int bits = 8 << esz; 8160 uint64_t ones = dup_const(esz, 1); 8161 uint64_t signs = ones << (bits - 1); 8162 uint64_t cmp0, cmp1; 8163 8164 cmp1 = dup_const(esz, n); 8165 cmp0 = cmp1 ^ m0; 8166 cmp1 = cmp1 ^ m1; 8167 cmp0 = (cmp0 - ones) & ~cmp0; 8168 cmp1 = (cmp1 - ones) & ~cmp1; 8169 return (cmp0 | cmp1) & signs; 8170 } 8171 8172 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg, 8173 uint32_t desc, int esz, bool nmatch) 8174 { 8175 uint16_t esz_mask = pred_esz_masks[esz]; 8176 intptr_t opr_sz = simd_oprsz(desc); 8177 uint32_t flags = PREDTEST_INIT; 8178 intptr_t i, j, k; 8179 8180 for (i = 0; i < opr_sz; i += 16) { 8181 uint64_t m0 = *(uint64_t *)(vm + i); 8182 uint64_t m1 = *(uint64_t *)(vm + i + 8); 8183 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask; 8184 uint16_t out = 0; 8185 8186 for (j = 0; j < 16; j += 8) { 8187 uint64_t n = *(uint64_t *)(vn + i + j); 8188 8189 for (k = 0; k < 8; k += 1 << esz) { 8190 if (pg & (1 << (j + k))) { 8191 bool o = do_match2(n >> (k * 8), m0, m1, esz); 8192 out |= (o ^ nmatch) << (j + k); 8193 } 8194 } 8195 } 8196 *(uint16_t *)(vd + H1_2(i >> 3)) = out; 8197 flags = iter_predtest_fwd(out, pg, flags); 8198 } 8199 return flags; 8200 } 8201 8202 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \ 8203 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 8204 { \ 8205 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \ 8206 } 8207 8208 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false) 8209 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false) 8210 8211 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true) 8212 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true) 8213 8214 #undef DO_PPZZ_MATCH 8215 8216 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg, 8217 uint32_t desc) 8218 { 8219 ARMVectorReg scratch; 8220 intptr_t i, j; 8221 intptr_t opr_sz = simd_oprsz(desc); 8222 uint32_t *d = vd, *n = vn, *m = vm; 8223 uint8_t *pg = vg; 8224 8225 if (d == n) { 8226 n = memcpy(&scratch, n, opr_sz); 8227 if (d == m) { 8228 m = n; 8229 } 8230 } else if (d == m) { 8231 m = memcpy(&scratch, m, opr_sz); 8232 } 8233 8234 for (i = 0; i < opr_sz; i += 4) { 8235 uint64_t count = 0; 8236 uint8_t pred; 8237 8238 pred = pg[H1(i >> 3)] >> 
(i & 7); 8239 if (pred & 1) { 8240 uint32_t nn = n[H4(i >> 2)]; 8241 8242 for (j = 0; j <= i; j += 4) { 8243 pred = pg[H1(j >> 3)] >> (j & 7); 8244 if ((pred & 1) && nn == m[H4(j >> 2)]) { 8245 ++count; 8246 } 8247 } 8248 } 8249 d[H4(i >> 2)] = count; 8250 } 8251 } 8252 8253 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg, 8254 uint32_t desc) 8255 { 8256 ARMVectorReg scratch; 8257 intptr_t i, j; 8258 intptr_t opr_sz = simd_oprsz(desc); 8259 uint64_t *d = vd, *n = vn, *m = vm; 8260 uint8_t *pg = vg; 8261 8262 if (d == n) { 8263 n = memcpy(&scratch, n, opr_sz); 8264 if (d == m) { 8265 m = n; 8266 } 8267 } else if (d == m) { 8268 m = memcpy(&scratch, m, opr_sz); 8269 } 8270 8271 for (i = 0; i < opr_sz / 8; ++i) { 8272 uint64_t count = 0; 8273 if (pg[H1(i)] & 1) { 8274 uint64_t nn = n[i]; 8275 for (j = 0; j <= i; ++j) { 8276 if ((pg[H1(j)] & 1) && nn == m[j]) { 8277 ++count; 8278 } 8279 } 8280 } 8281 d[i] = count; 8282 } 8283 } 8284 8285 /* 8286 * Returns the number of bytes in m0 and m1 that match n. 8287 * Unlike do_match2 we don't just need true/false, we need an exact count. 8288 * This requires two extra logical operations. 8289 */ 8290 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1) 8291 { 8292 const uint64_t mask = dup_const(MO_8, 0x7f); 8293 uint64_t cmp0, cmp1; 8294 8295 cmp1 = dup_const(MO_8, n); 8296 cmp0 = cmp1 ^ m0; 8297 cmp1 = cmp1 ^ m1; 8298 8299 /* 8300 * 1: clear msb of each byte to avoid carry to next byte (& mask) 8301 * 2: carry in to msb if byte != 0 (+ mask) 8302 * 3: set msb if cmp has msb set (| cmp) 8303 * 4: set ~msb to ignore them (| mask) 8304 * We now have 0xff for byte != 0 or 0x7f for byte == 0. 8305 * 5: invert, resulting in 0x80 if and only if byte == 0. 8306 */ 8307 cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask); 8308 cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask); 8309 8310 /* 8311 * Combine the two compares in a way that the bits do 8312 * not overlap, and so preserves the count of set bits. 8313 * If the host has an efficient instruction for ctpop, 8314 * then ctpop(x) + ctpop(y) has the same number of 8315 * operations as ctpop(x | (y >> 1)). If the host does 8316 * not have an efficient ctpop, then we only want to 8317 * use it once. 
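 * The merge is lossless: after step 5 each byte of cmp0 and cmp1 is either 0x80 or 0x00, so cmp1 >> 1 can only set bit 6 of a byte while cmp0 can only set bit 7, and no set bits collide.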
8318 */ 8319 return ctpop64(cmp0 | (cmp1 >> 1)); 8320 } 8321 8322 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc) 8323 { 8324 intptr_t i, j; 8325 intptr_t opr_sz = simd_oprsz(desc); 8326 8327 for (i = 0; i < opr_sz; i += 16) { 8328 uint64_t n0 = *(uint64_t *)(vn + i); 8329 uint64_t m0 = *(uint64_t *)(vm + i); 8330 uint64_t n1 = *(uint64_t *)(vn + i + 8); 8331 uint64_t m1 = *(uint64_t *)(vm + i + 8); 8332 uint64_t out0 = 0; 8333 uint64_t out1 = 0; 8334 8335 for (j = 0; j < 64; j += 8) { 8336 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1); 8337 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1); 8338 out0 |= cnt0 << j; 8339 out1 |= cnt1 << j; 8340 } 8341 8342 *(uint64_t *)(vd + i) = out0; 8343 *(uint64_t *)(vd + i + 8) = out1; 8344 } 8345 } 8346 8347 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc) 8348 { 8349 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8350 int shr = simd_data(desc); 8351 int shl = 8 - shr; 8352 uint64_t mask = dup_const(MO_8, 0xff >> shr); 8353 uint64_t *d = vd, *n = vn, *m = vm; 8354 8355 for (i = 0; i < opr_sz; ++i) { 8356 uint64_t t = n[i] ^ m[i]; 8357 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); 8358 } 8359 } 8360 8361 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc) 8362 { 8363 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 8364 int shr = simd_data(desc); 8365 int shl = 16 - shr; 8366 uint64_t mask = dup_const(MO_16, 0xffff >> shr); 8367 uint64_t *d = vd, *n = vn, *m = vm; 8368 8369 for (i = 0; i < opr_sz; ++i) { 8370 uint64_t t = n[i] ^ m[i]; 8371 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); 8372 } 8373 } 8374 8375 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc) 8376 { 8377 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 8378 int shr = simd_data(desc); 8379 uint32_t *d = vd, *n = vn, *m = vm; 8380 8381 for (i = 0; i < opr_sz; ++i) { 8382 d[i] = ror32(n[i] ^ m[i], shr); 8383 } 8384 } 8385 8386 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va, 8387 float_status *status, uint32_t desc) 8388 { 8389 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4); 8390 8391 for (s = 0; s < opr_sz; ++s) { 8392 float32 *n = vn + s * sizeof(float32) * 4; 8393 float32 *m = vm + s * sizeof(float32) * 4; 8394 float32 *a = va + s * sizeof(float32) * 4; 8395 float32 *d = vd + s * sizeof(float32) * 4; 8396 float32 n00 = n[H4(0)], n01 = n[H4(1)]; 8397 float32 n10 = n[H4(2)], n11 = n[H4(3)]; 8398 float32 m00 = m[H4(0)], m01 = m[H4(1)]; 8399 float32 m10 = m[H4(2)], m11 = m[H4(3)]; 8400 float32 p0, p1; 8401 8402 /* i = 0, j = 0 */ 8403 p0 = float32_mul(n00, m00, status); 8404 p1 = float32_mul(n01, m01, status); 8405 d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status); 8406 8407 /* i = 0, j = 1 */ 8408 p0 = float32_mul(n00, m10, status); 8409 p1 = float32_mul(n01, m11, status); 8410 d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status); 8411 8412 /* i = 1, j = 0 */ 8413 p0 = float32_mul(n10, m00, status); 8414 p1 = float32_mul(n11, m01, status); 8415 d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status); 8416 8417 /* i = 1, j = 1 */ 8418 p0 = float32_mul(n10, m10, status); 8419 p1 = float32_mul(n11, m11, status); 8420 d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status); 8421 } 8422 } 8423 8424 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va, 8425 float_status *status, uint32_t desc) 8426 { 8427 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4); 8428 8429 for (s = 0; s < opr_sz; ++s) { 8430 float64 *n = vn 
+ s * sizeof(float64) * 4; 8431 float64 *m = vm + s * sizeof(float64) * 4; 8432 float64 *a = va + s * sizeof(float64) * 4; 8433 float64 *d = vd + s * sizeof(float64) * 4; 8434 float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3]; 8435 float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3]; 8436 float64 p0, p1; 8437 8438 /* i = 0, j = 0 */ 8439 p0 = float64_mul(n00, m00, status); 8440 p1 = float64_mul(n01, m01, status); 8441 d[0] = float64_add(a[0], float64_add(p0, p1, status), status); 8442 8443 /* i = 0, j = 1 */ 8444 p0 = float64_mul(n00, m10, status); 8445 p1 = float64_mul(n01, m11, status); 8446 d[1] = float64_add(a[1], float64_add(p0, p1, status), status); 8447 8448 /* i = 1, j = 0 */ 8449 p0 = float64_mul(n10, m00, status); 8450 p1 = float64_mul(n11, m01, status); 8451 d[2] = float64_add(a[2], float64_add(p0, p1, status), status); 8452 8453 /* i = 1, j = 1 */ 8454 p0 = float64_mul(n10, m10, status); 8455 p1 = float64_mul(n11, m11, status); 8456 d[3] = float64_add(a[3], float64_add(p0, p1, status), status); 8457 } 8458 } 8459 8460 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 8461 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 8462 float_status *status, uint32_t desc) \ 8463 { \ 8464 intptr_t i = simd_oprsz(desc); \ 8465 uint64_t *g = vg; \ 8466 do { \ 8467 uint64_t pg = g[(i - 1) >> 6]; \ 8468 do { \ 8469 i -= sizeof(TYPEW); \ 8470 if (likely((pg >> (i & 63)) & 1)) { \ 8471 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 8472 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \ 8473 } \ 8474 } while (i & 63); \ 8475 } while (i != 0); \ 8476 } 8477 8478 DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16) 8479 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16) 8480 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32) 8481 8482 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 8483 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 8484 float_status *status, uint32_t desc) \ 8485 { \ 8486 intptr_t i = simd_oprsz(desc); \ 8487 uint64_t *g = vg; \ 8488 do { \ 8489 uint64_t pg = g[(i - 1) >> 6]; \ 8490 do { \ 8491 i -= sizeof(TYPEW); \ 8492 if (likely((pg >> (i & 63)) & 1)) { \ 8493 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \ 8494 *(TYPEW *)(vd + HW(i)) = OP(nn, status); \ 8495 } \ 8496 } while (i & 63); \ 8497 } while (i != 0); \ 8498 } 8499 8500 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32) 8501 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64) 8502 8503 #undef DO_FCVTLT 8504 #undef DO_FCVTNT 8505 8506 void HELPER(pext)(void *vd, uint32_t png, uint32_t desc) 8507 { 8508 int pl = FIELD_EX32(desc, PREDDESC, OPRSZ); 8509 int vl = pl * 8; 8510 unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ); 8511 int part = FIELD_EX32(desc, PREDDESC, DATA); 8512 DecodeCounter p = decode_counter(png, vl, v_esz); 8513 uint64_t mask = pred_esz_masks[v_esz + p.lg2_stride]; 8514 ARMPredicateReg *d = vd; 8515 8516 /* 8517 * Convert from element count to byte count and adjust 8518 * for the portion of the 4*VL counter to be extracted. 8519 */ 8520 int b_count = (p.count << v_esz) - vl * part; 8521 8522 memset(d, 0, sizeof(*d)); 8523 if (p.invert) { 8524 if (b_count <= 0) { 8525 do_whilel(vd, mask, vl, vl); 8526 } else if (b_count < vl) { 8527 do_whileg(vd, mask, vl - b_count, vl); 8528 } 8529 } else if (b_count > 0) { 8530 do_whilel(vd, mask, MIN(b_count, vl), vl); 8531 } 8532 } 8533