/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "exec/exec-all.h"
#include "exec/page-protection.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "tcg/tcg.h"
#include "vec_internal.h"
#include "sve_ldst_internal.h"
#include "hw/core/tcg-cpu-ops.h"
#ifdef CONFIG_USER_ONLY
#include "user/page-protection.h"
#endif


/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1

/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}

/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}

/* The same for a single word predicate.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

/* The same for a multi-word predicate.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
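
/*
 * Worked example (illustrative): with a single predicate word where
 * G = 0x11 (lanes 0 and 4 active) and D = 0x10, the first active lane
 * has its D bit clear so bit 31 (N) stays 0, some D & G bit is set so
 * bit 1 is set (Z clear), and the last active lane has its D bit set
 * so bit 0 (C) ends up 0.  Bit 2 of the returned flags is only the
 * internal "first G bit seen" marker used by the iterators above.
 */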

/* Similarly for single word elements.  */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    return word[byte & 0x11];
}

#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    uintptr_t opr_sz = simd_oprsz(desc); \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
    uintptr_t i; \
    for (i = 0; i < opr_sz / 8; ++i) { \
        d[i] = FUNC(n[i], m[i], g[i]); \
    } \
}

#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP

/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i], mm = m[i]; \
            d[i] = OP(nn, mm); \
        } \
    } \
}
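
/*
 * Illustrative note on the expanders above: the governing predicate
 * has one bit per vector byte, so each 16-bit chunk of VG covers a
 * 16-byte segment of the vector.  Stepping with
 * "i += sizeof(TYPE), pg >>= sizeof(TYPE)" keeps bit 0 of PG aligned
 * with the least significant byte of the current element; e.g. for
 * 32-bit elements only predicate bits 0, 4, 8 and 12 of each chunk
 * are ever tested.
 */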

#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)


/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1.  Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
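
/*
 * Illustrative edge cases for DO_SDIV and DO_UDIV above: any division
 * by zero yields 0, and every signed division by -1 is computed as a
 * negation, so INT_MIN / -1 wraps back to INT_MIN as the architecture
 * requires and the undefined C division is never evaluated.
 */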

DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)

/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types.  */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}

DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)

/* Note that all bits of the shift are significant
   and not modulo the element size.  */
#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)

DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)

DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)

static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    int8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    int16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    int32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)

static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    uint8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    uint16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    uint32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)

#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)

#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)

/*
 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
 * We pass in a pointer to a dummy saturation field to trigger
 * the saturating arithmetic but discard the information about
 * whether it has occurred.
 */
#define do_sqshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
#define do_sqshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
#define do_sqshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
#define do_sqshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)

#define do_uqshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_uqshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_uqshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
#define do_uqshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)

#define do_sqrshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
#define do_sqrshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
#define do_sqrshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
#define do_sqrshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)

#undef do_sqrshl_d

#define do_uqrshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
#define do_uqrshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
#define do_uqrshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
#define do_uqrshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)

#undef do_uqrshl_d

#define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))

DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)

#define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))

DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)

#define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))

DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
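
/*
 * Illustrative note on the halving forms above: for 64-bit elements
 * the sum cannot be widened, so DO_HADD_D averages the two halves and
 * adds back the carry from the two discarded low bits, e.g.
 * DO_HADD_D(3, 5) = 1 + 2 + 1 = 4, while DO_RHADD_D adds the rounding
 * bit whenever either low bit is set, e.g. DO_RHADD_D(3, 4) = 4.
 */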

static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    return val >= max ? max : val <= min ? min : val;
}

#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    int64_t r = n + m;
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}

DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)

#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)

static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t r = n + m;
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = n - m;
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}

DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)

static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    return n > m ? n - m : 0;
}

DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow.  */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative.  */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive.  */
                return INT64_MAX;
            }
            /* Result is negative.  */
        }
    } else {
        /* Both inputs are positive: check for overflow.  */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}

DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)

static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        return n < -m ? 0 : r;
    }
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

#undef DO_ZPZZ
#undef DO_ZPZZ_D
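
/*
 * Illustrative note on the 64-bit saturating helpers above: in
 * do_sqadd_d, (r ^ n) is negative when the result changed sign from
 * the first operand and ~(n ^ m) is negative when both operands had
 * the same sign, so the conjunction flags exactly the additions that
 * overflowed; e.g. INT64_MAX + 1 saturates to INT64_MAX and
 * INT64_MIN + -1 saturates to INT64_MIN.  do_sqsub_d tests (n ^ m)
 * instead because a subtraction can only overflow when the operand
 * signs differ.
 */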

/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements are from VN {I, I+1}.
 * If the slot I is odd, the elements are from VM {I-1, I}.
 * Load all of the input elements in each pair before overwriting output.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            TYPE n0 = *(TYPE *)(vn + H(i)); \
            TYPE m0 = *(TYPE *)(vm + H(i)); \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(n0, n1); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(m0, m1); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 2) { \
        TYPE n0 = n[i], n1 = n[i + 1]; \
        TYPE m0 = m[i], m1 = m[i + 1]; \
        if (pg[H1(i)] & 1) { \
            d[i] = OP(n0, n1); \
        } \
        if (pg[H1(i + 1)] & 1) { \
            d[i + 1] = OP(m0, m1); \
        } \
    } \
}

DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D
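
/*
 * Illustrative example of the pairing above: for ADDP with 32-bit
 * elements, result slot 0 receives N[0] + N[1], result slot 1
 * receives M[0] + M[1], slot 2 receives N[2] + N[3], and so on, with
 * each write still gated by the governing predicate bit for that
 * result slot.
 */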

#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
                  float_status *status, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            TYPE n0 = *(TYPE *)(vn + H(i)); \
            TYPE m0 = *(TYPE *)(vm + H(i)); \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)

DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)

DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)

DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)

DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)

#undef DO_ZPZZ_PAIR_FP

/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 7); \
    } \
}

DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW
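
/*
 * Illustrative example of the "wide" forms above: for LSL (wide) with
 * byte elements, the single 64-bit element M[0] provides the shift
 * count for all eight byte elements N[0..7] of the first 8-byte
 * block, M[1] covers N[8..15], and so on.
 */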

/* Fully general two-operand expander, controlled by a predicate.
 */
#define DO_ZPZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn); \
        } \
    } \
}

#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 2) {
        if (pg[H1(i)] & 1) {
            uint64_t n0 = n[i + 0];
            uint64_t n1 = n[i + 1];
            d[i + 0] = n1;
            d[i + 1] = n0;
        }
    }
}

DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)

#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)

#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)

DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)

/* Three-operand expander, unpredicated, in which the third operand is "wide".
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            i += sizeof(TYPE); \
        } while (i & 7); \
    } \
}

DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D

/*
 * Three-operand expander, unpredicated, in which the two inputs are
 * selected from the top or bottom half of the wide column.
 */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
    } \
}

DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
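
/*
 * Illustrative example of the top/bottom selection above: with 16-bit
 * wide results, sel1 = sel2 = 0 pairs the even (bottom) byte elements
 * of N and M, sel1 = sel2 = 1 pairs the odd (top) byte elements, and
 * the mixed settings of the two desc bits serve the cross BT/TB
 * variants of these instructions.
 */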

/* Note that the multiply cannot overflow, but the doubling can.  */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t val = n * m;
    return DO_SQADD_H(val, val);
}

static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t val = n * m;
    return DO_SQADD_S(val, val);
}

static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    int64_t val = n * m;
    return do_sqadd_d(val, val);
}

DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

#undef DO_ZZZ_TB

#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEW *)(vn + HW(i)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
    } \
}

DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

#undef DO_ZZZ_WTB

#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
    intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
    for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
        TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
        *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
    } \
}

DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)

#undef DO_ZZZ_NTB

#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
        TYPEW aa = *(TYPEW *)(va + HW(i)); \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
    } \
}

DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#define DO_NMUL(N, M)  -(N * M)

DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)

DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)

#undef DO_ZZZW_ACC

#define DO_XTNB(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + i); \
        nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
        *(TYPE *)(vd + i) = nn; \
    } \
}

#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + i); \
        *(TYPEN *)(vd + i + odd) = OP(nn); \
    } \
}

#define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
#define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
#define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)

DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)

DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)

#define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
#define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
#define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)

DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)

DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)

DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)

DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)

#undef DO_XTNB
#undef DO_XTNT
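
/*
 * Illustrative example of the narrowing forms above: the B forms
 * write each saturated half-width result into the even (bottom) half
 * of the wide element and zero the odd half via the mask, while the
 * T forms write only the odd (top) half and leave the even half of
 * the destination unchanged.
 */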

void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
    uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t *a = va, *n = vn;
    uint64_t *d = vd, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint32_t e1 = a[2 * i + H4(0)];
        uint32_t e2 = n[2 * i + sel] ^ inv;
        uint64_t c = extract64(m[i], 32, 1);
        /* Compute and store the entire 33-bit result at once.  */
        d[i] = c + e1 + e2;
    }
}

void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t *d = vd, *a = va, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        Int128 e1 = int128_make64(a[i]);
        Int128 e2 = int128_make64(n[i + sel] ^ inv);
        Int128 c = int128_make64(m[i + 1] & 1);
        Int128 r = int128_add(int128_add(e1, e2), c);
        d[i + 0] = int128_getlo(r);
        d[i + 1] = int128_gethi(r);
    }
}
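
/*
 * Illustrative note on the add-with-carry helpers above: each 64-bit
 * lane of M carries the incoming carry in bit 32, e1 + e2 + c fits in
 * 33 bits, and storing the full 64-bit sum leaves the carry-out in
 * bit 32 of D ready to be chained into the next ADCLB/ADCLT; the inv
 * mask ones-complements N for the subtract-with-carry forms.  The _d
 * variant does the same with a 128-bit intermediate.
 */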

#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
        TYPEW aa = *(TYPEW *)(va + HW(i)); \
        *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
    } \
}

DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQADD_H)
DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQADD_S)
DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqadd_d)

DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQSUB_H)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQSUB_S)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqsub_d)

#undef DO_SQDMLAL

#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
    int rot = simd_data(desc); \
    int sel_a = rot & 1, sel_b = sel_a ^ 1; \
    bool sub_r = rot == 1 || rot == 2; \
    bool sub_i = rot >= 2; \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    for (i = 0; i < opr_sz; i += 2) { \
        TYPE elt1_a = n[H(i + sel_a)]; \
        TYPE elt2_a = m[H(i + sel_a)]; \
        TYPE elt2_b = m[H(i + sel_b)]; \
        d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
        d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
    } \
}

#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))

DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)

#define DO_SQRDMLAH_B(N, M, A, S) \
    do_sqrdmlah_b(N, M, A, S, true)
#define DO_SQRDMLAH_H(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A, S) \
    do_sqrdmlah_d(N, M, A, S, true)

DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)

#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
    int sel_a = rot & 1, sel_b = sel_a ^ 1; \
    bool sub_r = rot == 1 || rot == 2; \
    bool sub_i = rot >= 2; \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
        TYPE elt2_a = m[H(i + idx + sel_a)]; \
        TYPE elt2_b = m[H(i + idx + sel_b)]; \
        for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
            TYPE elt1_a = n[H(i + j + sel_a)]; \
            d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
            d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
        } \
    } \
}

DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)

DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)

#undef DO_CMLA
#undef DO_CMLA_FUNC
#undef DO_CMLA_IDX_FUNC
#undef DO_SQRDMLAH_B
#undef DO_SQRDMLAH_H
#undef DO_SQRDMLAH_S
#undef DO_SQRDMLAH_D

/* Note N and M are 4 elements bundled into one unit.  */
static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
                         int sel_a, int sel_b, int sub_i)
{
    for (int i = 0; i <= 1; i++) {
        int32_t elt1_r = (int8_t)(n >> (16 * i));
        int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
        int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
        int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));

        a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
    }
    return a;
}

static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
                         int sel_a, int sel_b, int sub_i)
{
    for (int i = 0; i <= 1; i++) {
        int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
        int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
        int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
        int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));

        a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
    }
    return a;
}
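
/*
 * Illustrative note on the rotation decode used by the CDOT helpers
 * below and consumed by do_cdot_s/do_cdot_d above: sel_a picks which
 * half of each (real, imag) pair of M multiplies the real part of N,
 * sel_b picks the other half for the imaginary part of N, and sub_i
 * is the +/-1 sign on that second product.  For example rot = 0
 * accumulates Nr * Mr - Ni * Mi and rot = 1 accumulates
 * Nr * Mi + Ni * Mr.
 */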

void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
                              void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = simd_data(desc);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint32_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int e = 0; e < opr_sz / 4; e++) {
        d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
    }
}

void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
                              void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = simd_data(desc);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int e = 0; e < opr_sz / 8; e++) {
        d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
    }
}

void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
    int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint32_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int seg = 0; seg < opr_sz / 4; seg += 4) {
        uint32_t seg_m = m[seg + idx];
        for (int e = 0; e < 4; e++) {
            d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
                                   sel_a, sel_b, sub_i);
        }
    }
}

void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    int seg, opr_sz = simd_oprsz(desc);
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (seg = 0; seg < opr_sz / 8; seg += 2) {
        uint64_t seg_m = m[seg + idx];
        for (int e = 0; e < 2; e++) {
            d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
                                   sel_a, sel_b, sub_i);
        }
    }
}

#define DO_ZZXZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
    intptr_t i, j, idx = simd_data(desc); \
    TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[i]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = OP(n[i + j], mm, a[i + j]); \
        } \
    } \
}

#define DO_SQRDMLAH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)

DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)

#define DO_SQRDMLSH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)

DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)

#undef DO_ZZXZ
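
/*
 * Illustrative note on the indexed forms above: idx names one element
 * within each 128-bit segment of M, and that single element is reused
 * for every element of N in the same segment, e.g. for 16-bit
 * elements each group of eight N values is combined with the one
 * selected M value from its own segment.
 */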

#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
    for (i = 0; i < oprsz; i += 16) { \
        TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
        for (j = 0; j < 16; j += sizeof(TYPEW)) { \
            TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
            TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
            *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
        } \
    } \
}

#define DO_MLA(N, M, A)  (A + N * M)

DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)

#define DO_MLS(N, M, A)  (A - N * M)

DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)

#define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)

#define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZZXW

#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
    for (i = 0; i < oprsz; i += 16) { \
        TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
        for (j = 0; j < 16; j += sizeof(TYPEW)) { \
            TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
            *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
        } \
    } \
}

DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#undef DO_ZZX

#define DO_BITPERM(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + i); \
        TYPE mm = *(TYPE *)(vm + i); \
        *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
    } \
}
<< rb; 1608 ++rb; 1609 } 1610 } 1611 return res; 1612 } 1613 1614 DO_BITPERM(sve2_bext_b, uint8_t, bitextract) 1615 DO_BITPERM(sve2_bext_h, uint16_t, bitextract) 1616 DO_BITPERM(sve2_bext_s, uint32_t, bitextract) 1617 DO_BITPERM(sve2_bext_d, uint64_t, bitextract) 1618 1619 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) 1620 { 1621 uint64_t res = 0; 1622 int rb, db = 0; 1623 1624 for (rb = 0; rb < n; ++rb) { 1625 if ((mask >> rb) & 1) { 1626 res |= ((data >> db) & 1) << rb; 1627 ++db; 1628 } 1629 } 1630 return res; 1631 } 1632 1633 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) 1634 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) 1635 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) 1636 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit) 1637 1638 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) 1639 { 1640 uint64_t resm = 0, resu = 0; 1641 int db, rbm = 0, rbu = 0; 1642 1643 for (db = 0; db < n; ++db) { 1644 uint64_t val = (data >> db) & 1; 1645 if ((mask >> db) & 1) { 1646 resm |= val << rbm++; 1647 } else { 1648 resu |= val << rbu++; 1649 } 1650 } 1651 1652 return resm | (resu << rbm); 1653 } 1654 1655 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) 1656 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) 1657 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) 1658 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) 1659 1660 #undef DO_BITPERM 1661 1662 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ 1663 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1664 { \ 1665 intptr_t i, opr_sz = simd_oprsz(desc); \ 1666 int sub_r = simd_data(desc); \ 1667 if (sub_r) { \ 1668 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1669 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1670 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1671 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1672 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1673 acc_r = ADD_OP(acc_r, el2_i); \ 1674 acc_i = SUB_OP(acc_i, el2_r); \ 1675 *(TYPE *)(vd + H(i)) = acc_r; \ 1676 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1677 } \ 1678 } else { \ 1679 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1680 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1681 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1682 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1683 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1684 acc_r = SUB_OP(acc_r, el2_i); \ 1685 acc_i = ADD_OP(acc_i, el2_r); \ 1686 *(TYPE *)(vd + H(i)) = acc_r; \ 1687 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1688 } \ 1689 } \ 1690 } 1691 1692 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) 1693 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) 1694 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) 1695 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) 1696 1697 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) 1698 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) 1699 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) 1700 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) 1701 1702 #undef DO_CADD 1703 1704 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ 1705 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1706 { \ 1707 intptr_t i, opr_sz = simd_oprsz(desc); \ 1708 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ 1709 int shift = simd_data(desc) >> 1; \ 1710 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1711 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ 1712 *(TYPEW *)(vd + HW(i)) = nn << shift; \ 1713 } \ 1714 } 1715 1716 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) 1717 DO_ZZI_SHLL(sve2_sshll_s, int32_t, 
int16_t, H1_4, H1_2) 1718 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) 1719 1720 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) 1721 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) 1722 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) 1723 1724 #undef DO_ZZI_SHLL 1725 1726 /* Two-operand reduction expander, controlled by a predicate. 1727 * The difference between TYPERED and TYPERET has to do with 1728 * sign-extension. E.g. for SMAX, TYPERED must be signed, 1729 * but TYPERET must be unsigned so that e.g. a 32-bit value 1730 * is not sign-extended to the ABI uint64_t return type. 1731 */ 1732 /* ??? If we were to vectorize this by hand the reduction ordering 1733 * would change. For integer operands, this is perfectly fine. 1734 */ 1735 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ 1736 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1737 { \ 1738 intptr_t i, opr_sz = simd_oprsz(desc); \ 1739 TYPERED ret = INIT; \ 1740 for (i = 0; i < opr_sz; ) { \ 1741 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 1742 do { \ 1743 if (pg & 1) { \ 1744 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ 1745 ret = OP(ret, nn); \ 1746 } \ 1747 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ 1748 } while (i & 15); \ 1749 } \ 1750 return (TYPERET)ret; \ 1751 } 1752 1753 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ 1754 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1755 { \ 1756 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 1757 TYPEE *n = vn; \ 1758 uint8_t *pg = vg; \ 1759 TYPER ret = INIT; \ 1760 for (i = 0; i < opr_sz; i += 1) { \ 1761 if (pg[H1(i)] & 1) { \ 1762 TYPEE nn = n[i]; \ 1763 ret = OP(ret, nn); \ 1764 } \ 1765 } \ 1766 return ret; \ 1767 } 1768 1769 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) 1770 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) 1771 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) 1772 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) 1773 1774 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) 1775 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) 1776 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) 1777 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) 1778 1779 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) 1780 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) 1781 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) 1782 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) 1783 1784 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1785 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1786 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1787 1788 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1789 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1790 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1791 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) 1792 1793 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) 1794 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) 1795 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) 1796 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) 1797 1798 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) 1799 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) 1800 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) 
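/*
 * Illustrative sketch only (not one of the helpers above; the function name
 * and the flat elem[]/active[] arrays are inventions for exposition): this is
 * roughly what DO_VPZ(sve_smaxv_b, ...) expands to.  The accumulator starts
 * from the INIT identity value (INT8_MIN for signed max), only elements whose
 * predicate flag is set contribute, and the result is narrowed through the
 * unsigned TYPERET so that it is zero-extended rather than sign-extended into
 * the uint64_t ABI return slot.  With an all-false predicate the identity
 * value itself is returned.
 */
static G_GNUC_UNUSED uint64_t example_smaxv_b(const int8_t *elem,
                                              const bool *active,
                                              size_t nelem)
{
    int8_t acc = INT8_MIN;                          /* INIT for DO_MAX */
    for (size_t i = 0; i < nelem; i++) {
        if (active[i]) {                            /* predicate bit set */
            acc = acc >= elem[i] ? acc : elem[i];   /* DO_MAX */
        }
    }
    return (uint8_t)acc;                            /* unsigned TYPERET */
}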
1801 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) 1802 1803 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) 1804 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) 1805 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) 1806 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) 1807 1808 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) 1809 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) 1810 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) 1811 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) 1812 1813 #undef DO_VPZ 1814 #undef DO_VPZ_D 1815 1816 /* Two vector operand, one scalar operand, unpredicated. */ 1817 #define DO_ZZI(NAME, TYPE, OP) \ 1818 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ 1819 { \ 1820 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1821 TYPE s = s64, *d = vd, *n = vn; \ 1822 for (i = 0; i < opr_sz; ++i) { \ 1823 d[i] = OP(n[i], s); \ 1824 } \ 1825 } 1826 1827 #define DO_SUBR(X, Y) (Y - X) 1828 1829 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) 1830 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) 1831 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) 1832 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) 1833 1834 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) 1835 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) 1836 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) 1837 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) 1838 1839 DO_ZZI(sve_smini_b, int8_t, DO_MIN) 1840 DO_ZZI(sve_smini_h, int16_t, DO_MIN) 1841 DO_ZZI(sve_smini_s, int32_t, DO_MIN) 1842 DO_ZZI(sve_smini_d, int64_t, DO_MIN) 1843 1844 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) 1845 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) 1846 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) 1847 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) 1848 1849 DO_ZZI(sve_umini_b, uint8_t, DO_MIN) 1850 DO_ZZI(sve_umini_h, uint16_t, DO_MIN) 1851 DO_ZZI(sve_umini_s, uint32_t, DO_MIN) 1852 DO_ZZI(sve_umini_d, uint64_t, DO_MIN) 1853 1854 #undef DO_ZZI 1855 1856 #undef DO_AND 1857 #undef DO_ORR 1858 #undef DO_EOR 1859 #undef DO_BIC 1860 #undef DO_ADD 1861 #undef DO_SUB 1862 #undef DO_MAX 1863 #undef DO_MIN 1864 #undef DO_ABD 1865 #undef DO_MUL 1866 #undef DO_DIV 1867 #undef DO_ASR 1868 #undef DO_LSR 1869 #undef DO_LSL 1870 #undef DO_SUBR 1871 1872 /* Similar to the ARM LastActiveElement pseudocode function, except the 1873 result is multiplied by the element size. This includes the not found 1874 indication; e.g. not found for esz=3 is -8. */ 1875 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) 1876 { 1877 uint64_t mask = pred_esz_masks[esz]; 1878 intptr_t i = words; 1879 1880 do { 1881 uint64_t this_g = g[--i] & mask; 1882 if (this_g) { 1883 return i * 64 + (63 - clz64(this_g)); 1884 } 1885 } while (i > 0); 1886 return (intptr_t)-1 << esz; 1887 } 1888 1889 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) 1890 { 1891 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1892 uint32_t flags = PREDTEST_INIT; 1893 uint64_t *d = vd, *g = vg; 1894 intptr_t i = 0; 1895 1896 do { 1897 uint64_t this_d = d[i]; 1898 uint64_t this_g = g[i]; 1899 1900 if (this_g) { 1901 if (!(flags & 4)) { 1902 /* Set in D the first bit of G. 
*/ 1903 this_d |= this_g & -this_g; 1904 d[i] = this_d; 1905 } 1906 flags = iter_predtest_fwd(this_d, this_g, flags); 1907 } 1908 } while (++i < words); 1909 1910 return flags; 1911 } 1912 1913 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) 1914 { 1915 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1916 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 1917 uint32_t flags = PREDTEST_INIT; 1918 uint64_t *d = vd, *g = vg, esz_mask; 1919 intptr_t i, next; 1920 1921 next = last_active_element(vd, words, esz) + (1 << esz); 1922 esz_mask = pred_esz_masks[esz]; 1923 1924 /* Similar to the pseudocode for pnext, but scaled by ESZ 1925 so that we find the correct bit. */ 1926 if (next < words * 64) { 1927 uint64_t mask = -1; 1928 1929 if (next & 63) { 1930 mask = ~((1ull << (next & 63)) - 1); 1931 next &= -64; 1932 } 1933 do { 1934 uint64_t this_g = g[next / 64] & esz_mask & mask; 1935 if (this_g != 0) { 1936 next = (next & -64) + ctz64(this_g); 1937 break; 1938 } 1939 next += 64; 1940 mask = -1; 1941 } while (next < words * 64); 1942 } 1943 1944 i = 0; 1945 do { 1946 uint64_t this_d = 0; 1947 if (i == next / 64) { 1948 this_d = 1ull << (next & 63); 1949 } 1950 d[i] = this_d; 1951 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); 1952 } while (++i < words); 1953 1954 return flags; 1955 } 1956 1957 /* 1958 * Copy Zn into Zd, and store zero into inactive elements. 1959 * If inv, store zeros into the active elements. 1960 */ 1961 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) 1962 { 1963 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1964 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1965 uint64_t *d = vd, *n = vn; 1966 uint8_t *pg = vg; 1967 1968 for (i = 0; i < opr_sz; i += 1) { 1969 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); 1970 } 1971 } 1972 1973 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) 1974 { 1975 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1976 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1977 uint64_t *d = vd, *n = vn; 1978 uint8_t *pg = vg; 1979 1980 for (i = 0; i < opr_sz; i += 1) { 1981 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); 1982 } 1983 } 1984 1985 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) 1986 { 1987 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1988 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1989 uint64_t *d = vd, *n = vn; 1990 uint8_t *pg = vg; 1991 1992 for (i = 0; i < opr_sz; i += 1) { 1993 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); 1994 } 1995 } 1996 1997 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) 1998 { 1999 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2000 uint64_t *d = vd, *n = vn; 2001 uint8_t *pg = vg; 2002 uint8_t inv = simd_data(desc); 2003 2004 for (i = 0; i < opr_sz; i += 1) { 2005 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); 2006 } 2007 } 2008 2009 /* Three-operand expander, immediate operand, controlled by a predicate. 2010 */ 2011 #define DO_ZPZI(NAME, TYPE, H, OP) \ 2012 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2013 { \ 2014 intptr_t i, opr_sz = simd_oprsz(desc); \ 2015 TYPE imm = simd_data(desc); \ 2016 for (i = 0; i < opr_sz; ) { \ 2017 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2018 do { \ 2019 if (pg & 1) { \ 2020 TYPE nn = *(TYPE *)(vn + H(i)); \ 2021 *(TYPE *)(vd + H(i)) = OP(nn, imm); \ 2022 } \ 2023 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2024 } while (i & 15); \ 2025 } \ 2026 } 2027 2028 /* Similarly, specialized for 64-bit operands. 
*/ 2029 #define DO_ZPZI_D(NAME, TYPE, OP) \ 2030 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2031 { \ 2032 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2033 TYPE *d = vd, *n = vn; \ 2034 TYPE imm = simd_data(desc); \ 2035 uint8_t *pg = vg; \ 2036 for (i = 0; i < opr_sz; i += 1) { \ 2037 if (pg[H1(i)] & 1) { \ 2038 TYPE nn = n[i]; \ 2039 d[i] = OP(nn, imm); \ 2040 } \ 2041 } \ 2042 } 2043 2044 #define DO_SHR(N, M) (N >> M) 2045 #define DO_SHL(N, M) (N << M) 2046 2047 /* Arithmetic shift right for division. This rounds negative numbers 2048 toward zero as per signed division. Therefore before shifting, 2049 when N is negative, add 2**M-1. */ 2050 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) 2051 2052 static inline uint64_t do_urshr(uint64_t x, unsigned sh) 2053 { 2054 if (likely(sh < 64)) { 2055 return (x >> sh) + ((x >> (sh - 1)) & 1); 2056 } else if (sh == 64) { 2057 return x >> 63; 2058 } else { 2059 return 0; 2060 } 2061 } 2062 2063 static inline int64_t do_srshr(int64_t x, unsigned sh) 2064 { 2065 if (likely(sh < 64)) { 2066 return (x >> sh) + ((x >> (sh - 1)) & 1); 2067 } else { 2068 /* Rounding the sign bit always produces 0. */ 2069 return 0; 2070 } 2071 } 2072 2073 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) 2074 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) 2075 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) 2076 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) 2077 2078 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) 2079 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) 2080 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) 2081 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) 2082 2083 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) 2084 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) 2085 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) 2086 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) 2087 2088 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) 2089 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) 2090 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) 2091 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) 2092 2093 /* SVE2 bitwise shift by immediate */ 2094 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) 2095 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) 2096 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) 2097 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) 2098 2099 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) 2100 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) 2101 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) 2102 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) 2103 2104 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) 2105 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) 2106 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) 2107 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) 2108 2109 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) 2110 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) 2111 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) 2112 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) 2113 2114 #define do_suqrshl_b(n, m) \ 2115 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 2116 #define do_suqrshl_h(n, m) \ 2117 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 2118 #define do_suqrshl_s(n, m) \ 2119 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) 2120 #define do_suqrshl_d(n, m) \ 2121 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) 2122 2123 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) 2124 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) 2125 
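/*
 * Worked example (standalone sketch, not used by the instantiations around
 * it; the function name and the numeric inputs are made up): the two rounding
 * conventions used by the immediate shifts above.  DO_ASRD biases a negative
 * input by 2**M - 1 before shifting, so the result rounds toward zero as
 * signed division does, while do_urshr adds back the last bit shifted out,
 * rounding to nearest with halves rounded up.
 */
static G_GNUC_UNUSED void example_rounding_shifts(void)
{
    int8_t n = -7;
    int m = 2;

    /* ASRD: -7 / 4 rounded toward zero is -1; a plain >> 2 gives -2. */
    g_assert(DO_ASRD(n, m) == -1);
    g_assert((n >> m) == -2);

    /* URSHR: (23 >> 3) with rounding is round(23 / 8) == 3, not 2. */
    g_assert(do_urshr(23, 3) == 3);
}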
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) 2126 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) 2127 2128 #undef DO_ASRD 2129 #undef DO_ZPZI 2130 #undef DO_ZPZI_D 2131 2132 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ 2133 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2134 { \ 2135 intptr_t i, opr_sz = simd_oprsz(desc); \ 2136 int shift = simd_data(desc); \ 2137 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2138 TYPEW nn = *(TYPEW *)(vn + i); \ 2139 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ 2140 } \ 2141 } 2142 2143 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 2144 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2145 { \ 2146 intptr_t i, opr_sz = simd_oprsz(desc); \ 2147 int shift = simd_data(desc); \ 2148 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2149 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2150 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ 2151 } \ 2152 } 2153 2154 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) 2155 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) 2156 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) 2157 2158 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) 2159 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) 2160 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) 2161 2162 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) 2163 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) 2164 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) 2165 2166 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 2167 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) 2168 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) 2169 2170 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) 2171 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) 2172 #define DO_SQSHRUN_D(x, sh) \ 2173 do_sat_bhs((int64_t)(x) >> (sh < 64 ? 
sh : 63), 0, UINT32_MAX) 2174 2175 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) 2176 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) 2177 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) 2178 2179 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) 2180 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) 2181 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) 2182 2183 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) 2184 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) 2185 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) 2186 2187 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) 2188 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) 2189 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) 2190 2191 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) 2192 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) 2193 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) 2194 2195 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) 2196 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) 2197 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) 2198 2199 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) 2200 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) 2201 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) 2202 2203 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) 2204 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) 2205 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) 2206 2207 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) 2208 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) 2209 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) 2210 2211 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) 2212 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) 2213 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) 2214 2215 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) 2216 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) 2217 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) 2218 2219 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) 2220 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) 2221 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) 2222 2223 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) 2224 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) 2225 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) 2226 2227 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) 2228 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) 2229 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) 2230 2231 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) 2232 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) 2233 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) 2234 2235 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) 2236 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) 2237 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) 2238 2239 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) 2240 DO_SHRNT(sve2_uqrshrnt_s, 
uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) 2241 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) 2242 2243 #undef DO_SHRNB 2244 #undef DO_SHRNT 2245 2246 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ 2247 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2248 { \ 2249 intptr_t i, opr_sz = simd_oprsz(desc); \ 2250 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2251 TYPEW nn = *(TYPEW *)(vn + i); \ 2252 TYPEW mm = *(TYPEW *)(vm + i); \ 2253 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ 2254 } \ 2255 } 2256 2257 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ 2258 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2259 { \ 2260 intptr_t i, opr_sz = simd_oprsz(desc); \ 2261 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2262 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2263 TYPEW mm = *(TYPEW *)(vm + HW(i)); \ 2264 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ 2265 } \ 2266 } 2267 2268 #define DO_ADDHN(N, M, SH) ((N + M) >> SH) 2269 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) 2270 #define DO_SUBHN(N, M, SH) ((N - M) >> SH) 2271 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) 2272 2273 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) 2274 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) 2275 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) 2276 2277 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) 2278 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) 2279 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) 2280 2281 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) 2282 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) 2283 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) 2284 2285 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) 2286 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) 2287 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) 2288 2289 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) 2290 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) 2291 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) 2292 2293 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) 2294 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) 2295 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) 2296 2297 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) 2298 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) 2299 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) 2300 2301 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) 2302 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) 2303 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) 2304 2305 #undef DO_RSUBHN 2306 #undef DO_SUBHN 2307 #undef DO_RADDHN 2308 #undef DO_ADDHN 2309 2310 #undef DO_BINOPNB 2311 2312 /* Fully general four-operand expander, controlled by a predicate. 
2313 */ 2314 #define DO_ZPZZZ(NAME, TYPE, H, OP) \ 2315 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2316 void *vg, uint32_t desc) \ 2317 { \ 2318 intptr_t i, opr_sz = simd_oprsz(desc); \ 2319 for (i = 0; i < opr_sz; ) { \ 2320 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2321 do { \ 2322 if (pg & 1) { \ 2323 TYPE nn = *(TYPE *)(vn + H(i)); \ 2324 TYPE mm = *(TYPE *)(vm + H(i)); \ 2325 TYPE aa = *(TYPE *)(va + H(i)); \ 2326 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ 2327 } \ 2328 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2329 } while (i & 15); \ 2330 } \ 2331 } 2332 2333 /* Similarly, specialized for 64-bit operands. */ 2334 #define DO_ZPZZZ_D(NAME, TYPE, OP) \ 2335 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2336 void *vg, uint32_t desc) \ 2337 { \ 2338 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2339 TYPE *d = vd, *a = va, *n = vn, *m = vm; \ 2340 uint8_t *pg = vg; \ 2341 for (i = 0; i < opr_sz; i += 1) { \ 2342 if (pg[H1(i)] & 1) { \ 2343 TYPE aa = a[i], nn = n[i], mm = m[i]; \ 2344 d[i] = OP(aa, nn, mm); \ 2345 } \ 2346 } \ 2347 } 2348 2349 #define DO_MLA(A, N, M) (A + N * M) 2350 #define DO_MLS(A, N, M) (A - N * M) 2351 2352 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) 2353 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) 2354 2355 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) 2356 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) 2357 2358 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) 2359 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) 2360 2361 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) 2362 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) 2363 2364 #undef DO_MLA 2365 #undef DO_MLS 2366 #undef DO_ZPZZZ 2367 #undef DO_ZPZZZ_D 2368 2369 void HELPER(sve_index_b)(void *vd, uint32_t start, 2370 uint32_t incr, uint32_t desc) 2371 { 2372 intptr_t i, opr_sz = simd_oprsz(desc); 2373 uint8_t *d = vd; 2374 for (i = 0; i < opr_sz; i += 1) { 2375 d[H1(i)] = start + i * incr; 2376 } 2377 } 2378 2379 void HELPER(sve_index_h)(void *vd, uint32_t start, 2380 uint32_t incr, uint32_t desc) 2381 { 2382 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2383 uint16_t *d = vd; 2384 for (i = 0; i < opr_sz; i += 1) { 2385 d[H2(i)] = start + i * incr; 2386 } 2387 } 2388 2389 void HELPER(sve_index_s)(void *vd, uint32_t start, 2390 uint32_t incr, uint32_t desc) 2391 { 2392 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2393 uint32_t *d = vd; 2394 for (i = 0; i < opr_sz; i += 1) { 2395 d[H4(i)] = start + i * incr; 2396 } 2397 } 2398 2399 void HELPER(sve_index_d)(void *vd, uint64_t start, 2400 uint64_t incr, uint32_t desc) 2401 { 2402 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2403 uint64_t *d = vd; 2404 for (i = 0; i < opr_sz; i += 1) { 2405 d[i] = start + i * incr; 2406 } 2407 } 2408 2409 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) 2410 { 2411 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2412 uint32_t sh = simd_data(desc); 2413 uint32_t *d = vd, *n = vn, *m = vm; 2414 for (i = 0; i < opr_sz; i += 1) { 2415 d[i] = n[i] + (m[i] << sh); 2416 } 2417 } 2418 2419 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) 2420 { 2421 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2422 uint64_t sh = simd_data(desc); 2423 uint64_t *d = vd, *n = vn, *m = vm; 2424 for (i = 0; i < opr_sz; i += 1) { 2425 d[i] = n[i] + (m[i] << sh); 2426 } 2427 } 2428 2429 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) 2430 { 2431 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2432 uint64_t sh = simd_data(desc); 2433 uint64_t *d = vd, *n = vn, *m = vm; 2434 for (i = 0; i < opr_sz; i += 1) { 2435 d[i] = 
n[i] + ((uint64_t)(int32_t)m[i] << sh); 2436 } 2437 } 2438 2439 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) 2440 { 2441 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2442 uint64_t sh = simd_data(desc); 2443 uint64_t *d = vd, *n = vn, *m = vm; 2444 for (i = 0; i < opr_sz; i += 1) { 2445 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); 2446 } 2447 } 2448 2449 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) 2450 { 2451 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2452 static const uint16_t coeff[] = { 2453 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, 2454 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, 2455 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, 2456 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, 2457 }; 2458 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2459 uint16_t *d = vd, *n = vn; 2460 2461 for (i = 0; i < opr_sz; i++) { 2462 uint16_t nn = n[i]; 2463 intptr_t idx = extract32(nn, 0, 5); 2464 uint16_t exp = extract32(nn, 5, 5); 2465 d[i] = coeff[idx] | (exp << 10); 2466 } 2467 } 2468 2469 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) 2470 { 2471 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2472 static const uint32_t coeff[] = { 2473 0x000000, 0x0164d2, 0x02cd87, 0x043a29, 2474 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, 2475 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, 2476 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, 2477 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, 2478 0x1ef532, 0x20b051, 0x227043, 0x243516, 2479 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, 2480 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, 2481 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, 2482 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, 2483 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, 2484 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, 2485 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, 2486 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, 2487 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, 2488 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, 2489 }; 2490 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2491 uint32_t *d = vd, *n = vn; 2492 2493 for (i = 0; i < opr_sz; i++) { 2494 uint32_t nn = n[i]; 2495 intptr_t idx = extract32(nn, 0, 6); 2496 uint32_t exp = extract32(nn, 6, 8); 2497 d[i] = coeff[idx] | (exp << 23); 2498 } 2499 } 2500 2501 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) 2502 { 2503 /* These constants are cut-and-paste directly from the ARM pseudocode. 
*/ 2504 static const uint64_t coeff[] = { 2505 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, 2506 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, 2507 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, 2508 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, 2509 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, 2510 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, 2511 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, 2512 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, 2513 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, 2514 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, 2515 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, 2516 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, 2517 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, 2518 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, 2519 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, 2520 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, 2521 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, 2522 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, 2523 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, 2524 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, 2525 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, 2526 0xFA7C1819E90D8ull, 2527 }; 2528 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2529 uint64_t *d = vd, *n = vn; 2530 2531 for (i = 0; i < opr_sz; i++) { 2532 uint64_t nn = n[i]; 2533 intptr_t idx = extract32(nn, 0, 6); 2534 uint64_t exp = extract32(nn, 6, 11); 2535 d[i] = coeff[idx] | (exp << 52); 2536 } 2537 } 2538 2539 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) 2540 { 2541 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2542 uint16_t *d = vd, *n = vn, *m = vm; 2543 for (i = 0; i < opr_sz; i += 1) { 2544 uint16_t nn = n[i]; 2545 uint16_t mm = m[i]; 2546 if (mm & 1) { 2547 nn = float16_one; 2548 } 2549 d[i] = nn ^ (mm & 2) << 14; 2550 } 2551 } 2552 2553 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) 2554 { 2555 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2556 uint32_t *d = vd, *n = vn, *m = vm; 2557 for (i = 0; i < opr_sz; i += 1) { 2558 uint32_t nn = n[i]; 2559 uint32_t mm = m[i]; 2560 if (mm & 1) { 2561 nn = float32_one; 2562 } 2563 d[i] = nn ^ (mm & 2) << 30; 2564 } 2565 } 2566 2567 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) 2568 { 2569 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2570 uint64_t *d = vd, *n = vn, *m = vm; 2571 for (i = 0; i < opr_sz; i += 1) { 2572 uint64_t nn = n[i]; 2573 uint64_t mm = m[i]; 2574 if (mm & 1) { 2575 nn = float64_one; 2576 } 2577 d[i] = nn ^ (mm & 2) << 62; 2578 } 2579 } 2580 2581 /* 2582 * Signed saturating addition with scalar operand. 
2583 */ 2584 2585 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2586 { 2587 intptr_t i, oprsz = simd_oprsz(desc); 2588 2589 for (i = 0; i < oprsz; i += sizeof(int8_t)) { 2590 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); 2591 } 2592 } 2593 2594 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2595 { 2596 intptr_t i, oprsz = simd_oprsz(desc); 2597 2598 for (i = 0; i < oprsz; i += sizeof(int16_t)) { 2599 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); 2600 } 2601 } 2602 2603 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2604 { 2605 intptr_t i, oprsz = simd_oprsz(desc); 2606 2607 for (i = 0; i < oprsz; i += sizeof(int32_t)) { 2608 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); 2609 } 2610 } 2611 2612 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) 2613 { 2614 intptr_t i, oprsz = simd_oprsz(desc); 2615 2616 for (i = 0; i < oprsz; i += sizeof(int64_t)) { 2617 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); 2618 } 2619 } 2620 2621 /* 2622 * Unsigned saturating addition with scalar operand. 2623 */ 2624 2625 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2626 { 2627 intptr_t i, oprsz = simd_oprsz(desc); 2628 2629 for (i = 0; i < oprsz; i += sizeof(uint8_t)) { 2630 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); 2631 } 2632 } 2633 2634 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2635 { 2636 intptr_t i, oprsz = simd_oprsz(desc); 2637 2638 for (i = 0; i < oprsz; i += sizeof(uint16_t)) { 2639 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); 2640 } 2641 } 2642 2643 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2644 { 2645 intptr_t i, oprsz = simd_oprsz(desc); 2646 2647 for (i = 0; i < oprsz; i += sizeof(uint32_t)) { 2648 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); 2649 } 2650 } 2651 2652 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2653 { 2654 intptr_t i, oprsz = simd_oprsz(desc); 2655 2656 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2657 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); 2658 } 2659 } 2660 2661 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2662 { 2663 intptr_t i, oprsz = simd_oprsz(desc); 2664 2665 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2666 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); 2667 } 2668 } 2669 2670 /* Two operand predicated copy immediate with merge. All valid immediates 2671 * can fit within 17 signed bits in the simd_data field. 
2672 */ 2673 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, 2674 uint64_t mm, uint32_t desc) 2675 { 2676 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2677 uint64_t *d = vd, *n = vn; 2678 uint8_t *pg = vg; 2679 2680 mm = dup_const(MO_8, mm); 2681 for (i = 0; i < opr_sz; i += 1) { 2682 uint64_t nn = n[i]; 2683 uint64_t pp = expand_pred_b(pg[H1(i)]); 2684 d[i] = (mm & pp) | (nn & ~pp); 2685 } 2686 } 2687 2688 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, 2689 uint64_t mm, uint32_t desc) 2690 { 2691 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2692 uint64_t *d = vd, *n = vn; 2693 uint8_t *pg = vg; 2694 2695 mm = dup_const(MO_16, mm); 2696 for (i = 0; i < opr_sz; i += 1) { 2697 uint64_t nn = n[i]; 2698 uint64_t pp = expand_pred_h(pg[H1(i)]); 2699 d[i] = (mm & pp) | (nn & ~pp); 2700 } 2701 } 2702 2703 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, 2704 uint64_t mm, uint32_t desc) 2705 { 2706 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2707 uint64_t *d = vd, *n = vn; 2708 uint8_t *pg = vg; 2709 2710 mm = dup_const(MO_32, mm); 2711 for (i = 0; i < opr_sz; i += 1) { 2712 uint64_t nn = n[i]; 2713 uint64_t pp = expand_pred_s(pg[H1(i)]); 2714 d[i] = (mm & pp) | (nn & ~pp); 2715 } 2716 } 2717 2718 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, 2719 uint64_t mm, uint32_t desc) 2720 { 2721 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2722 uint64_t *d = vd, *n = vn; 2723 uint8_t *pg = vg; 2724 2725 for (i = 0; i < opr_sz; i += 1) { 2726 uint64_t nn = n[i]; 2727 d[i] = (pg[H1(i)] & 1 ? mm : nn); 2728 } 2729 } 2730 2731 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) 2732 { 2733 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2734 uint64_t *d = vd; 2735 uint8_t *pg = vg; 2736 2737 val = dup_const(MO_8, val); 2738 for (i = 0; i < opr_sz; i += 1) { 2739 d[i] = val & expand_pred_b(pg[H1(i)]); 2740 } 2741 } 2742 2743 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) 2744 { 2745 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2746 uint64_t *d = vd; 2747 uint8_t *pg = vg; 2748 2749 val = dup_const(MO_16, val); 2750 for (i = 0; i < opr_sz; i += 1) { 2751 d[i] = val & expand_pred_h(pg[H1(i)]); 2752 } 2753 } 2754 2755 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) 2756 { 2757 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2758 uint64_t *d = vd; 2759 uint8_t *pg = vg; 2760 2761 val = dup_const(MO_32, val); 2762 for (i = 0; i < opr_sz; i += 1) { 2763 d[i] = val & expand_pred_s(pg[H1(i)]); 2764 } 2765 } 2766 2767 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) 2768 { 2769 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2770 uint64_t *d = vd; 2771 uint8_t *pg = vg; 2772 2773 for (i = 0; i < opr_sz; i += 1) { 2774 d[i] = (pg[H1(i)] & 1 ? val : 0); 2775 } 2776 } 2777 2778 /* Big-endian hosts need to frob the byte indices. If the copy 2779 * happens to be 8-byte aligned, then no frobbing necessary. 
2780 */ 2781 static void swap_memmove(void *vd, void *vs, size_t n) 2782 { 2783 uintptr_t d = (uintptr_t)vd; 2784 uintptr_t s = (uintptr_t)vs; 2785 uintptr_t o = (d | s | n) & 7; 2786 size_t i; 2787 2788 #if !HOST_BIG_ENDIAN 2789 o = 0; 2790 #endif 2791 switch (o) { 2792 case 0: 2793 memmove(vd, vs, n); 2794 break; 2795 2796 case 4: 2797 if (d < s || d >= s + n) { 2798 for (i = 0; i < n; i += 4) { 2799 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2800 } 2801 } else { 2802 for (i = n; i > 0; ) { 2803 i -= 4; 2804 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2805 } 2806 } 2807 break; 2808 2809 case 2: 2810 case 6: 2811 if (d < s || d >= s + n) { 2812 for (i = 0; i < n; i += 2) { 2813 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2814 } 2815 } else { 2816 for (i = n; i > 0; ) { 2817 i -= 2; 2818 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2819 } 2820 } 2821 break; 2822 2823 default: 2824 if (d < s || d >= s + n) { 2825 for (i = 0; i < n; i++) { 2826 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2827 } 2828 } else { 2829 for (i = n; i > 0; ) { 2830 i -= 1; 2831 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2832 } 2833 } 2834 break; 2835 } 2836 } 2837 2838 /* Similarly for memset of 0. */ 2839 static void swap_memzero(void *vd, size_t n) 2840 { 2841 uintptr_t d = (uintptr_t)vd; 2842 uintptr_t o = (d | n) & 7; 2843 size_t i; 2844 2845 /* Usually, the first bit of a predicate is set, so N is 0. */ 2846 if (likely(n == 0)) { 2847 return; 2848 } 2849 2850 #if !HOST_BIG_ENDIAN 2851 o = 0; 2852 #endif 2853 switch (o) { 2854 case 0: 2855 memset(vd, 0, n); 2856 break; 2857 2858 case 4: 2859 for (i = 0; i < n; i += 4) { 2860 *(uint32_t *)H1_4(d + i) = 0; 2861 } 2862 break; 2863 2864 case 2: 2865 case 6: 2866 for (i = 0; i < n; i += 2) { 2867 *(uint16_t *)H1_2(d + i) = 0; 2868 } 2869 break; 2870 2871 default: 2872 for (i = 0; i < n; i++) { 2873 *(uint8_t *)H1(d + i) = 0; 2874 } 2875 break; 2876 } 2877 } 2878 2879 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) 2880 { 2881 intptr_t opr_sz = simd_oprsz(desc); 2882 size_t n_ofs = simd_data(desc); 2883 size_t n_siz = opr_sz - n_ofs; 2884 2885 if (vd != vm) { 2886 swap_memmove(vd, vn + n_ofs, n_siz); 2887 swap_memmove(vd + n_siz, vm, n_ofs); 2888 } else if (vd != vn) { 2889 swap_memmove(vd + n_siz, vd, n_ofs); 2890 swap_memmove(vd, vn + n_ofs, n_siz); 2891 } else { 2892 /* vd == vn == vm. Need temp space. 
*/ 2893 ARMVectorReg tmp; 2894 swap_memmove(&tmp, vm, n_ofs); 2895 swap_memmove(vd, vd + n_ofs, n_siz); 2896 memcpy(vd + n_siz, &tmp, n_ofs); 2897 } 2898 } 2899 2900 #define DO_INSR(NAME, TYPE, H) \ 2901 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ 2902 { \ 2903 intptr_t opr_sz = simd_oprsz(desc); \ 2904 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ 2905 *(TYPE *)(vd + H(0)) = val; \ 2906 } 2907 2908 DO_INSR(sve_insr_b, uint8_t, H1) 2909 DO_INSR(sve_insr_h, uint16_t, H1_2) 2910 DO_INSR(sve_insr_s, uint32_t, H1_4) 2911 DO_INSR(sve_insr_d, uint64_t, H1_8) 2912 2913 #undef DO_INSR 2914 2915 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) 2916 { 2917 intptr_t i, j, opr_sz = simd_oprsz(desc); 2918 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2919 uint64_t f = *(uint64_t *)(vn + i); 2920 uint64_t b = *(uint64_t *)(vn + j); 2921 *(uint64_t *)(vd + i) = bswap64(b); 2922 *(uint64_t *)(vd + j) = bswap64(f); 2923 } 2924 } 2925 2926 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) 2927 { 2928 intptr_t i, j, opr_sz = simd_oprsz(desc); 2929 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2930 uint64_t f = *(uint64_t *)(vn + i); 2931 uint64_t b = *(uint64_t *)(vn + j); 2932 *(uint64_t *)(vd + i) = hswap64(b); 2933 *(uint64_t *)(vd + j) = hswap64(f); 2934 } 2935 } 2936 2937 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) 2938 { 2939 intptr_t i, j, opr_sz = simd_oprsz(desc); 2940 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2941 uint64_t f = *(uint64_t *)(vn + i); 2942 uint64_t b = *(uint64_t *)(vn + j); 2943 *(uint64_t *)(vd + i) = rol64(b, 32); 2944 *(uint64_t *)(vd + j) = rol64(f, 32); 2945 } 2946 } 2947 2948 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) 2949 { 2950 intptr_t i, j, opr_sz = simd_oprsz(desc); 2951 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2952 uint64_t f = *(uint64_t *)(vn + i); 2953 uint64_t b = *(uint64_t *)(vn + j); 2954 *(uint64_t *)(vd + i) = b; 2955 *(uint64_t *)(vd + j) = f; 2956 } 2957 } 2958 2959 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); 2960 2961 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, 2962 bool is_tbx, tb_impl_fn *fn) 2963 { 2964 ARMVectorReg scratch; 2965 uintptr_t oprsz = simd_oprsz(desc); 2966 2967 if (unlikely(vd == vn)) { 2968 vn = memcpy(&scratch, vn, oprsz); 2969 } 2970 2971 fn(vd, vn, NULL, vm, oprsz, is_tbx); 2972 } 2973 2974 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, 2975 uint32_t desc, bool is_tbx, tb_impl_fn *fn) 2976 { 2977 ARMVectorReg scratch; 2978 uintptr_t oprsz = simd_oprsz(desc); 2979 2980 if (unlikely(vd == vn0)) { 2981 vn0 = memcpy(&scratch, vn0, oprsz); 2982 if (vd == vn1) { 2983 vn1 = vn0; 2984 } 2985 } else if (unlikely(vd == vn1)) { 2986 vn1 = memcpy(&scratch, vn1, oprsz); 2987 } 2988 2989 fn(vd, vn0, vn1, vm, oprsz, is_tbx); 2990 } 2991 2992 #define DO_TB(SUFF, TYPE, H) \ 2993 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ 2994 void *vm, uintptr_t oprsz, bool is_tbx) \ 2995 { \ 2996 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ 2997 uintptr_t i, nelem = oprsz / sizeof(TYPE); \ 2998 for (i = 0; i < nelem; ++i) { \ 2999 TYPE index = indexes[H1(i)], val = 0; \ 3000 if (index < nelem) { \ 3001 val = tbl0[H(index)]; \ 3002 } else { \ 3003 index -= nelem; \ 3004 if (tbl1 && index < nelem) { \ 3005 val = tbl1[H(index)]; \ 3006 } else if (is_tbx) { \ 3007 continue; \ 3008 } \ 3009 } \ 3010 
d[H(i)] = val; \ 3011 } \ 3012 } \ 3013 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3014 { \ 3015 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ 3016 } \ 3017 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ 3018 void *vm, uint32_t desc) \ 3019 { \ 3020 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ 3021 } \ 3022 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3023 { \ 3024 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ 3025 } 3026 3027 DO_TB(b, uint8_t, H1) 3028 DO_TB(h, uint16_t, H2) 3029 DO_TB(s, uint32_t, H4) 3030 DO_TB(d, uint64_t, H8) 3031 3032 #undef DO_TB 3033 3034 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ 3035 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 3036 { \ 3037 intptr_t i, opr_sz = simd_oprsz(desc); \ 3038 TYPED *d = vd; \ 3039 TYPES *n = vn; \ 3040 ARMVectorReg tmp; \ 3041 if (unlikely(vn - vd < opr_sz)) { \ 3042 n = memcpy(&tmp, n, opr_sz / 2); \ 3043 } \ 3044 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ 3045 d[HD(i)] = n[HS(i)]; \ 3046 } \ 3047 } 3048 3049 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) 3050 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) 3051 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) 3052 3053 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) 3054 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) 3055 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) 3056 3057 #undef DO_UNPK 3058 3059 /* Mask of bits included in the even numbered predicates of width esz. 3060 * We also use this for expand_bits/compress_bits, and so extend the 3061 * same pattern out to 16-bit units. 3062 */ 3063 static const uint64_t even_bit_esz_masks[5] = { 3064 0x5555555555555555ull, 3065 0x3333333333333333ull, 3066 0x0f0f0f0f0f0f0f0full, 3067 0x00ff00ff00ff00ffull, 3068 0x0000ffff0000ffffull, 3069 }; 3070 3071 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits. 3072 * For N==0, this corresponds to the operation that in qemu/bitops.h 3073 * we call half_shuffle64; this algorithm is from Hacker's Delight, 3074 * section 7-2 Shuffling Bits. 3075 */ 3076 static uint64_t expand_bits(uint64_t x, int n) 3077 { 3078 int i; 3079 3080 x &= 0xffffffffu; 3081 for (i = 4; i >= n; i--) { 3082 int sh = 1 << i; 3083 x = ((x << sh) | x) & even_bit_esz_masks[i]; 3084 } 3085 return x; 3086 } 3087 3088 /* Compress units of 2**(N+1) bits to units of 2**N bits. 3089 * For N==0, this corresponds to the operation that in qemu/bitops.h 3090 * we call half_unshuffle64; this algorithm is from Hacker's Delight, 3091 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. 
3092 */ 3093 static uint64_t compress_bits(uint64_t x, int n) 3094 { 3095 int i; 3096 3097 for (i = n; i <= 4; i++) { 3098 int sh = 1 << i; 3099 x &= even_bit_esz_masks[i]; 3100 x = (x >> sh) | x; 3101 } 3102 return x & 0xffffffffu; 3103 } 3104 3105 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3106 { 3107 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3108 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3109 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3110 int esize = 1 << esz; 3111 uint64_t *d = vd; 3112 intptr_t i; 3113 3114 if (oprsz <= 8) { 3115 uint64_t nn = *(uint64_t *)vn; 3116 uint64_t mm = *(uint64_t *)vm; 3117 int half = 4 * oprsz; 3118 3119 nn = extract64(nn, high * half, half); 3120 mm = extract64(mm, high * half, half); 3121 nn = expand_bits(nn, esz); 3122 mm = expand_bits(mm, esz); 3123 d[0] = nn | (mm << esize); 3124 } else { 3125 ARMPredicateReg tmp; 3126 3127 /* We produce output faster than we consume input. 3128 Therefore we must be mindful of possible overlap. */ 3129 if (vd == vn) { 3130 vn = memcpy(&tmp, vn, oprsz); 3131 if (vd == vm) { 3132 vm = vn; 3133 } 3134 } else if (vd == vm) { 3135 vm = memcpy(&tmp, vm, oprsz); 3136 } 3137 if (high) { 3138 high = oprsz >> 1; 3139 } 3140 3141 if ((oprsz & 7) == 0) { 3142 uint32_t *n = vn, *m = vm; 3143 high >>= 2; 3144 3145 for (i = 0; i < oprsz / 8; i++) { 3146 uint64_t nn = n[H4(high + i)]; 3147 uint64_t mm = m[H4(high + i)]; 3148 3149 nn = expand_bits(nn, esz); 3150 mm = expand_bits(mm, esz); 3151 d[i] = nn | (mm << esize); 3152 } 3153 } else { 3154 uint8_t *n = vn, *m = vm; 3155 uint16_t *d16 = vd; 3156 3157 for (i = 0; i < oprsz / 2; i++) { 3158 uint16_t nn = n[H1(high + i)]; 3159 uint16_t mm = m[H1(high + i)]; 3160 3161 nn = expand_bits(nn, esz); 3162 mm = expand_bits(mm, esz); 3163 d16[H2(i)] = nn | (mm << esize); 3164 } 3165 } 3166 } 3167 } 3168 3169 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3170 { 3171 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3172 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3173 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; 3174 uint64_t *d = vd, *n = vn, *m = vm; 3175 uint64_t l, h; 3176 intptr_t i; 3177 3178 if (oprsz <= 8) { 3179 l = compress_bits(n[0] >> odd, esz); 3180 h = compress_bits(m[0] >> odd, esz); 3181 d[0] = l | (h << (4 * oprsz)); 3182 } else { 3183 ARMPredicateReg tmp_m; 3184 intptr_t oprsz_16 = oprsz / 16; 3185 3186 if ((vm - vd) < (uintptr_t)oprsz) { 3187 m = memcpy(&tmp_m, vm, oprsz); 3188 } 3189 3190 for (i = 0; i < oprsz_16; i++) { 3191 l = n[2 * i + 0]; 3192 h = n[2 * i + 1]; 3193 l = compress_bits(l >> odd, esz); 3194 h = compress_bits(h >> odd, esz); 3195 d[i] = l | (h << 32); 3196 } 3197 3198 /* 3199 * For VL which is not a multiple of 512, the results from M do not 3200 * align nicely with the uint64_t for D. Put the aligned results 3201 * from M into TMP_M and then copy it into place afterward. 
3202 */ 3203 if (oprsz & 15) { 3204 int final_shift = (oprsz & 15) * 2; 3205 3206 l = n[2 * i + 0]; 3207 h = n[2 * i + 1]; 3208 l = compress_bits(l >> odd, esz); 3209 h = compress_bits(h >> odd, esz); 3210 d[i] = l | (h << final_shift); 3211 3212 for (i = 0; i < oprsz_16; i++) { 3213 l = m[2 * i + 0]; 3214 h = m[2 * i + 1]; 3215 l = compress_bits(l >> odd, esz); 3216 h = compress_bits(h >> odd, esz); 3217 tmp_m.p[i] = l | (h << 32); 3218 } 3219 l = m[2 * i + 0]; 3220 h = m[2 * i + 1]; 3221 l = compress_bits(l >> odd, esz); 3222 h = compress_bits(h >> odd, esz); 3223 tmp_m.p[i] = l | (h << final_shift); 3224 3225 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); 3226 } else { 3227 for (i = 0; i < oprsz_16; i++) { 3228 l = m[2 * i + 0]; 3229 h = m[2 * i + 1]; 3230 l = compress_bits(l >> odd, esz); 3231 h = compress_bits(h >> odd, esz); 3232 d[oprsz_16 + i] = l | (h << 32); 3233 } 3234 } 3235 } 3236 } 3237 3238 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3239 { 3240 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3241 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3242 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); 3243 uint64_t *d = vd, *n = vn, *m = vm; 3244 uint64_t mask; 3245 int shr, shl; 3246 intptr_t i; 3247 3248 shl = 1 << esz; 3249 shr = 0; 3250 mask = even_bit_esz_masks[esz]; 3251 if (odd) { 3252 mask <<= shl; 3253 shr = shl; 3254 shl = 0; 3255 } 3256 3257 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { 3258 uint64_t nn = (n[i] & mask) >> shr; 3259 uint64_t mm = (m[i] & mask) << shl; 3260 d[i] = nn + mm; 3261 } 3262 } 3263 3264 /* Reverse units of 2**N bits. */ 3265 static uint64_t reverse_bits_64(uint64_t x, int n) 3266 { 3267 int i, sh; 3268 3269 x = bswap64(x); 3270 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3271 uint64_t mask = even_bit_esz_masks[i]; 3272 x = ((x & mask) << sh) | ((x >> sh) & mask); 3273 } 3274 return x; 3275 } 3276 3277 static uint8_t reverse_bits_8(uint8_t x, int n) 3278 { 3279 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; 3280 int i, sh; 3281 3282 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3283 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); 3284 } 3285 return x; 3286 } 3287 3288 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) 3289 { 3290 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3291 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3292 intptr_t i, oprsz_2 = oprsz / 2; 3293 3294 if (oprsz <= 8) { 3295 uint64_t l = *(uint64_t *)vn; 3296 l = reverse_bits_64(l << (64 - 8 * oprsz), esz); 3297 *(uint64_t *)vd = l; 3298 } else if ((oprsz & 15) == 0) { 3299 for (i = 0; i < oprsz_2; i += 8) { 3300 intptr_t ih = oprsz - 8 - i; 3301 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); 3302 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); 3303 *(uint64_t *)(vd + i) = h; 3304 *(uint64_t *)(vd + ih) = l; 3305 } 3306 } else { 3307 for (i = 0; i < oprsz_2; i += 1) { 3308 intptr_t il = H1(i); 3309 intptr_t ih = H1(oprsz - 1 - i); 3310 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); 3311 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); 3312 *(uint8_t *)(vd + il) = h; 3313 *(uint8_t *)(vd + ih) = l; 3314 } 3315 } 3316 } 3317 3318 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) 3319 { 3320 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3321 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3322 uint64_t *d = vd; 3323 intptr_t i; 3324 3325 if (oprsz <= 8) { 3326 uint64_t nn = *(uint64_t *)vn; 3327 int half = 4 * oprsz; 3328 3329 nn = 
extract64(nn, high * half, half); 3330 nn = expand_bits(nn, 0); 3331 d[0] = nn; 3332 } else { 3333 ARMPredicateReg tmp_n; 3334 3335 /* We produce output faster than we consume input. 3336 Therefore we must be mindful of possible overlap. */ 3337 if ((vn - vd) < (uintptr_t)oprsz) { 3338 vn = memcpy(&tmp_n, vn, oprsz); 3339 } 3340 if (high) { 3341 high = oprsz >> 1; 3342 } 3343 3344 if ((oprsz & 7) == 0) { 3345 uint32_t *n = vn; 3346 high >>= 2; 3347 3348 for (i = 0; i < oprsz / 8; i++) { 3349 uint64_t nn = n[H4(high + i)]; 3350 d[i] = expand_bits(nn, 0); 3351 } 3352 } else { 3353 uint16_t *d16 = vd; 3354 uint8_t *n = vn; 3355 3356 for (i = 0; i < oprsz / 2; i++) { 3357 uint16_t nn = n[H1(high + i)]; 3358 d16[H2(i)] = expand_bits(nn, 0); 3359 } 3360 } 3361 } 3362 } 3363 3364 #define DO_ZIP(NAME, TYPE, H) \ 3365 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3366 { \ 3367 intptr_t oprsz = simd_oprsz(desc); \ 3368 intptr_t odd_ofs = simd_data(desc); \ 3369 intptr_t i, oprsz_2 = oprsz / 2; \ 3370 ARMVectorReg tmp_n, tmp_m; \ 3371 /* We produce output faster than we consume input. \ 3372 Therefore we must be mindful of possible overlap. */ \ 3373 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ 3374 vn = memcpy(&tmp_n, vn, oprsz); \ 3375 } \ 3376 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3377 vm = memcpy(&tmp_m, vm, oprsz); \ 3378 } \ 3379 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ 3380 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ 3381 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ 3382 *(TYPE *)(vm + odd_ofs + H(i)); \ 3383 } \ 3384 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3385 memset(vd + oprsz - 16, 0, 16); \ 3386 } \ 3387 } 3388 3389 DO_ZIP(sve_zip_b, uint8_t, H1) 3390 DO_ZIP(sve_zip_h, uint16_t, H1_2) 3391 DO_ZIP(sve_zip_s, uint32_t, H1_4) 3392 DO_ZIP(sve_zip_d, uint64_t, H1_8) 3393 DO_ZIP(sve2_zip_q, Int128, ) 3394 3395 #define DO_UZP(NAME, TYPE, H) \ 3396 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3397 { \ 3398 intptr_t oprsz = simd_oprsz(desc); \ 3399 intptr_t odd_ofs = simd_data(desc); \ 3400 intptr_t i, p; \ 3401 ARMVectorReg tmp_m; \ 3402 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3403 vm = memcpy(&tmp_m, vm, oprsz); \ 3404 } \ 3405 i = 0, p = odd_ofs; \ 3406 do { \ 3407 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ 3408 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3409 } while (p < oprsz); \ 3410 p -= oprsz; \ 3411 do { \ 3412 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ 3413 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3414 } while (p < oprsz); \ 3415 tcg_debug_assert(i == oprsz); \ 3416 } 3417 3418 DO_UZP(sve_uzp_b, uint8_t, H1) 3419 DO_UZP(sve_uzp_h, uint16_t, H1_2) 3420 DO_UZP(sve_uzp_s, uint32_t, H1_4) 3421 DO_UZP(sve_uzp_d, uint64_t, H1_8) 3422 DO_UZP(sve2_uzp_q, Int128, ) 3423 3424 #define DO_TRN(NAME, TYPE, H) \ 3425 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3426 { \ 3427 intptr_t oprsz = simd_oprsz(desc); \ 3428 intptr_t odd_ofs = simd_data(desc); \ 3429 intptr_t i; \ 3430 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ 3431 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ 3432 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ 3433 *(TYPE *)(vd + H(i + 0)) = ae; \ 3434 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ 3435 } \ 3436 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3437 memset(vd + oprsz - 16, 0, 16); \ 3438 } \ 3439 } 3440 3441 DO_TRN(sve_trn_b, uint8_t, H1) 3442 DO_TRN(sve_trn_h, uint16_t, H1_2) 3443 DO_TRN(sve_trn_s, uint32_t, H1_4) 3444 DO_TRN(sve_trn_d, 
uint64_t, H1_8) 3445 DO_TRN(sve2_trn_q, Int128, ) 3446 3447 #undef DO_ZIP 3448 #undef DO_UZP 3449 #undef DO_TRN 3450 3451 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) 3452 { 3453 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; 3454 uint32_t *d = vd, *n = vn; 3455 uint8_t *pg = vg; 3456 3457 for (i = j = 0; i < opr_sz; i++) { 3458 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { 3459 d[H4(j)] = n[H4(i)]; 3460 j++; 3461 } 3462 } 3463 for (; j < opr_sz; j++) { 3464 d[H4(j)] = 0; 3465 } 3466 } 3467 3468 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) 3469 { 3470 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; 3471 uint64_t *d = vd, *n = vn; 3472 uint8_t *pg = vg; 3473 3474 for (i = j = 0; i < opr_sz; i++) { 3475 if (pg[H1(i)] & 1) { 3476 d[j] = n[i]; 3477 j++; 3478 } 3479 } 3480 for (; j < opr_sz; j++) { 3481 d[j] = 0; 3482 } 3483 } 3484 3485 /* Similar to the ARM LastActiveElement pseudocode function, except the 3486 * result is multiplied by the element size. This includes the not found 3487 * indication; e.g. not found for esz=3 is -8. 3488 */ 3489 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) 3490 { 3491 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 3492 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3493 3494 return last_active_element(vg, words, esz); 3495 } 3496 3497 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) 3498 { 3499 intptr_t opr_sz = simd_oprsz(desc) / 8; 3500 int esz = simd_data(desc); 3501 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; 3502 intptr_t i, first_i, last_i; 3503 ARMVectorReg tmp; 3504 3505 first_i = last_i = 0; 3506 first_g = last_g = 0; 3507 3508 /* Find the extent of the active elements within VG. 
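* The backward scan below records the lowest and highest predicate words
* with active bits; FIRST_I and LAST_I are then refined to bit positions,
* which double as byte offsets into the vector, so that LEN covers exactly
* the active slice of VN copied to the bottom of VD before VM fills the
* remainder.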
*/ 3509 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { 3510 pg = *(uint64_t *)(vg + i) & mask; 3511 if (pg) { 3512 if (last_g == 0) { 3513 last_g = pg; 3514 last_i = i; 3515 } 3516 first_g = pg; 3517 first_i = i; 3518 } 3519 } 3520 3521 len = 0; 3522 if (first_g != 0) { 3523 first_i = first_i * 8 + ctz64(first_g); 3524 last_i = last_i * 8 + 63 - clz64(last_g); 3525 len = last_i - first_i + (1 << esz); 3526 if (vd == vm) { 3527 vm = memcpy(&tmp, vm, opr_sz * 8); 3528 } 3529 swap_memmove(vd, vn + first_i, len); 3530 } 3531 swap_memmove(vd + len, vm, opr_sz * 8 - len); 3532 } 3533 3534 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, 3535 void *vg, uint32_t desc) 3536 { 3537 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3538 uint64_t *d = vd, *n = vn, *m = vm; 3539 uint8_t *pg = vg; 3540 3541 for (i = 0; i < opr_sz; i += 1) { 3542 uint64_t nn = n[i], mm = m[i]; 3543 uint64_t pp = expand_pred_b(pg[H1(i)]); 3544 d[i] = (nn & pp) | (mm & ~pp); 3545 } 3546 } 3547 3548 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, 3549 void *vg, uint32_t desc) 3550 { 3551 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3552 uint64_t *d = vd, *n = vn, *m = vm; 3553 uint8_t *pg = vg; 3554 3555 for (i = 0; i < opr_sz; i += 1) { 3556 uint64_t nn = n[i], mm = m[i]; 3557 uint64_t pp = expand_pred_h(pg[H1(i)]); 3558 d[i] = (nn & pp) | (mm & ~pp); 3559 } 3560 } 3561 3562 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, 3563 void *vg, uint32_t desc) 3564 { 3565 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3566 uint64_t *d = vd, *n = vn, *m = vm; 3567 uint8_t *pg = vg; 3568 3569 for (i = 0; i < opr_sz; i += 1) { 3570 uint64_t nn = n[i], mm = m[i]; 3571 uint64_t pp = expand_pred_s(pg[H1(i)]); 3572 d[i] = (nn & pp) | (mm & ~pp); 3573 } 3574 } 3575 3576 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, 3577 void *vg, uint32_t desc) 3578 { 3579 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3580 uint64_t *d = vd, *n = vn, *m = vm; 3581 uint8_t *pg = vg; 3582 3583 for (i = 0; i < opr_sz; i += 1) { 3584 uint64_t nn = n[i], mm = m[i]; 3585 d[i] = (pg[H1(i)] & 1 ? nn : mm); 3586 } 3587 } 3588 3589 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, 3590 void *vg, uint32_t desc) 3591 { 3592 intptr_t i, opr_sz = simd_oprsz(desc) / 16; 3593 Int128 *d = vd, *n = vn, *m = vm; 3594 uint16_t *pg = vg; 3595 3596 for (i = 0; i < opr_sz; i += 1) { 3597 d[i] = (pg[H2(i)] & 1 ? n : m)[i]; 3598 } 3599 } 3600 3601 /* Two operand comparison controlled by a predicate. 3602 * ??? It is very tempting to want to be able to expand this inline 3603 * with x86 instructions, e.g. 3604 * 3605 * vcmpeqw zm, zn, %ymm0 3606 * vpmovmskb %ymm0, %eax 3607 * and $0x5555, %eax 3608 * and pg, %eax 3609 * 3610 * or even aarch64, e.g. 3611 * 3612 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 3613 * cmeq v0.8h, zn, zm 3614 * and v0.8h, v0.8h, mask 3615 * addv h0, v0.8h 3616 * and v0.8b, pg 3617 * 3618 * However, coming up with an abstraction that allows vector inputs and 3619 * a scalar output, and also handles the byte-ordering of sub-uint64_t 3620 * scalar outputs, is tricky. 
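* The expansion below therefore stays scalar: it walks backward 64
* predicate bits at a time, accumulating one result bit per element into
* OUT (shifting up by sizeof(TYPE) per element), masks OUT with the
* governing predicate word, stores it, and folds it into the NZCV flags
* via iter_predtest_bwd.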
3621 */ 3622 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ 3623 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3624 { \ 3625 intptr_t opr_sz = simd_oprsz(desc); \ 3626 uint32_t flags = PREDTEST_INIT; \ 3627 intptr_t i = opr_sz; \ 3628 do { \ 3629 uint64_t out = 0, pg; \ 3630 do { \ 3631 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3632 TYPE nn = *(TYPE *)(vn + H(i)); \ 3633 TYPE mm = *(TYPE *)(vm + H(i)); \ 3634 out |= nn OP mm; \ 3635 } while (i & 63); \ 3636 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3637 out &= pg; \ 3638 *(uint64_t *)(vd + (i >> 3)) = out; \ 3639 flags = iter_predtest_bwd(out, pg, flags); \ 3640 } while (i > 0); \ 3641 return flags; \ 3642 } 3643 3644 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ 3645 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3646 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ 3647 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3648 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ 3649 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3650 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ 3651 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3652 3653 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) 3654 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) 3655 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) 3656 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) 3657 3658 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) 3659 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) 3660 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) 3661 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) 3662 3663 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) 3664 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) 3665 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) 3666 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) 3667 3668 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) 3669 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) 3670 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) 3671 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) 3672 3673 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) 3674 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) 3675 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) 3676 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) 3677 3678 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) 3679 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) 3680 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) 3681 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) 3682 3683 #undef DO_CMP_PPZZ_B 3684 #undef DO_CMP_PPZZ_H 3685 #undef DO_CMP_PPZZ_S 3686 #undef DO_CMP_PPZZ_D 3687 #undef DO_CMP_PPZZ 3688 3689 /* Similar, but the second source is "wide". 
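* Here "wide" means the second operand is read as 64-bit elements: each
* doubleword of VM is compared against every narrower element of VN that
* it overlaps, hence the extra inner loop which reloads MM once per eight
* bytes of VN.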
*/ 3690 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ 3691 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3692 { \ 3693 intptr_t opr_sz = simd_oprsz(desc); \ 3694 uint32_t flags = PREDTEST_INIT; \ 3695 intptr_t i = opr_sz; \ 3696 do { \ 3697 uint64_t out = 0, pg; \ 3698 do { \ 3699 TYPEW mm = *(TYPEW *)(vm + i - 8); \ 3700 do { \ 3701 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3702 TYPE nn = *(TYPE *)(vn + H(i)); \ 3703 out |= nn OP mm; \ 3704 } while (i & 7); \ 3705 } while (i & 63); \ 3706 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3707 out &= pg; \ 3708 *(uint64_t *)(vd + (i >> 3)) = out; \ 3709 flags = iter_predtest_bwd(out, pg, flags); \ 3710 } while (i > 0); \ 3711 return flags; \ 3712 } 3713 3714 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ 3715 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) 3716 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ 3717 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) 3718 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ 3719 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) 3720 3721 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) 3722 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) 3723 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) 3724 3725 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) 3726 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) 3727 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) 3728 3729 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) 3730 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) 3731 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) 3732 3733 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) 3734 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) 3735 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) 3736 3737 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) 3738 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) 3739 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) 3740 3741 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) 3742 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) 3743 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) 3744 3745 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) 3746 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) 3747 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) 3748 3749 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) 3750 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) 3751 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) 3752 3753 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) 3754 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) 3755 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) 3756 3757 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) 3758 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) 3759 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) 3760 3761 #undef DO_CMP_PPZW_B 3762 #undef DO_CMP_PPZW_H 3763 #undef DO_CMP_PPZW_S 3764 #undef DO_CMP_PPZW 3765 3766 /* Similar, but the second source is immediate. 
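* The immediate is recovered from simd_data(desc) and converted to TYPE
* by plain assignment, so each instantiation sees it with the width and
* signedness of its own element type.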
*/ 3767 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ 3768 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 3769 { \ 3770 intptr_t opr_sz = simd_oprsz(desc); \ 3771 uint32_t flags = PREDTEST_INIT; \ 3772 TYPE mm = simd_data(desc); \ 3773 intptr_t i = opr_sz; \ 3774 do { \ 3775 uint64_t out = 0, pg; \ 3776 do { \ 3777 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3778 TYPE nn = *(TYPE *)(vn + H(i)); \ 3779 out |= nn OP mm; \ 3780 } while (i & 63); \ 3781 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3782 out &= pg; \ 3783 *(uint64_t *)(vd + (i >> 3)) = out; \ 3784 flags = iter_predtest_bwd(out, pg, flags); \ 3785 } while (i > 0); \ 3786 return flags; \ 3787 } 3788 3789 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \ 3790 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3791 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \ 3792 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3793 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \ 3794 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3795 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \ 3796 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3797 3798 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) 3799 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) 3800 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) 3801 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) 3802 3803 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) 3804 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) 3805 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) 3806 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) 3807 3808 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) 3809 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) 3810 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) 3811 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) 3812 3813 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) 3814 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) 3815 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) 3816 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) 3817 3818 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) 3819 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) 3820 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) 3821 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) 3822 3823 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) 3824 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) 3825 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) 3826 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) 3827 3828 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) 3829 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) 3830 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) 3831 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) 3832 3833 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) 3834 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) 3835 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) 3836 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) 3837 3838 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) 3839 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) 3840 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) 3841 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) 3842 3843 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) 3844 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) 3845 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) 3846 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) 3847 3848 #undef DO_CMP_PPZI_B 3849 #undef DO_CMP_PPZI_H 3850 #undef DO_CMP_PPZI_S 3851 #undef DO_CMP_PPZI_D 3852 #undef DO_CMP_PPZI 3853 3854 /* Similar to the ARM LastActive pseudocode function. 
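* I.e. return the value of the D bit selected by the last (highest) active
* bit of G; pow2floor isolates that bit within the highest non-zero guard
* word.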
*/ 3855 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) 3856 { 3857 intptr_t i; 3858 3859 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { 3860 uint64_t pg = *(uint64_t *)(vg + i); 3861 if (pg) { 3862 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; 3863 } 3864 } 3865 return 0; 3866 } 3867 3868 /* Compute a mask into RETB that is true for all G, up to and including 3869 * (if after) or excluding (if !after) the first G & N. 3870 * Return true if BRK found. 3871 */ 3872 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, 3873 bool brk, bool after) 3874 { 3875 uint64_t b; 3876 3877 if (brk) { 3878 b = 0; 3879 } else if ((g & n) == 0) { 3880 /* For all G, no N are set; break not found. */ 3881 b = g; 3882 } else { 3883 /* Break somewhere in N. Locate it. */ 3884 b = g & n; /* guard true, pred true */ 3885 b = b & -b; /* first such */ 3886 if (after) { 3887 b = b | (b - 1); /* break after same */ 3888 } else { 3889 b = b - 1; /* break before same */ 3890 } 3891 brk = true; 3892 } 3893 3894 *retb = b; 3895 return brk; 3896 } 3897 3898 /* Compute a zeroing BRK. */ 3899 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, 3900 intptr_t oprsz, bool after) 3901 { 3902 bool brk = false; 3903 intptr_t i; 3904 3905 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3906 uint64_t this_b, this_g = g[i]; 3907 3908 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3909 d[i] = this_b & this_g; 3910 } 3911 } 3912 3913 /* Likewise, but also compute flags. */ 3914 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, 3915 intptr_t oprsz, bool after) 3916 { 3917 uint32_t flags = PREDTEST_INIT; 3918 bool brk = false; 3919 intptr_t i; 3920 3921 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3922 uint64_t this_b, this_d, this_g = g[i]; 3923 3924 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3925 d[i] = this_d = this_b & this_g; 3926 flags = iter_predtest_fwd(this_d, this_g, flags); 3927 } 3928 return flags; 3929 } 3930 3931 /* Compute a merging BRK. */ 3932 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, 3933 intptr_t oprsz, bool after) 3934 { 3935 bool brk = false; 3936 intptr_t i; 3937 3938 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3939 uint64_t this_b, this_g = g[i]; 3940 3941 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3942 d[i] = (this_b & this_g) | (d[i] & ~this_g); 3943 } 3944 } 3945 3946 /* Likewise, but also compute flags. */ 3947 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, 3948 intptr_t oprsz, bool after) 3949 { 3950 uint32_t flags = PREDTEST_INIT; 3951 bool brk = false; 3952 intptr_t i; 3953 3954 for (i = 0; i < oprsz / 8; ++i) { 3955 uint64_t this_b, this_d = d[i], this_g = g[i]; 3956 3957 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3958 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); 3959 flags = iter_predtest_fwd(this_d, this_g, flags); 3960 } 3961 return flags; 3962 } 3963 3964 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) 3965 { 3966 /* It is quicker to zero the whole predicate than loop on OPRSZ. 3967 * The compiler should turn this into 4 64-bit integer stores. 
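* (sizeof(ARMPredicateReg) is 32 bytes, the predicate size at QEMU's
* maximum 2048-bit vector length, hence the four stores.)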
3968 */ 3969 memset(d, 0, sizeof(ARMPredicateReg)); 3970 return PREDTEST_INIT; 3971 } 3972 3973 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, 3974 uint32_t pred_desc) 3975 { 3976 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3977 if (last_active_pred(vn, vg, oprsz)) { 3978 compute_brk_z(vd, vm, vg, oprsz, true); 3979 } else { 3980 do_zero(vd, oprsz); 3981 } 3982 } 3983 3984 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, 3985 uint32_t pred_desc) 3986 { 3987 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3988 if (last_active_pred(vn, vg, oprsz)) { 3989 return compute_brks_z(vd, vm, vg, oprsz, true); 3990 } else { 3991 return do_zero(vd, oprsz); 3992 } 3993 } 3994 3995 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, 3996 uint32_t pred_desc) 3997 { 3998 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3999 if (last_active_pred(vn, vg, oprsz)) { 4000 compute_brk_z(vd, vm, vg, oprsz, false); 4001 } else { 4002 do_zero(vd, oprsz); 4003 } 4004 } 4005 4006 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, 4007 uint32_t pred_desc) 4008 { 4009 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4010 if (last_active_pred(vn, vg, oprsz)) { 4011 return compute_brks_z(vd, vm, vg, oprsz, false); 4012 } else { 4013 return do_zero(vd, oprsz); 4014 } 4015 } 4016 4017 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4018 { 4019 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4020 compute_brk_z(vd, vn, vg, oprsz, true); 4021 } 4022 4023 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4024 { 4025 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4026 return compute_brks_z(vd, vn, vg, oprsz, true); 4027 } 4028 4029 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4030 { 4031 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4032 compute_brk_z(vd, vn, vg, oprsz, false); 4033 } 4034 4035 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4036 { 4037 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4038 return compute_brks_z(vd, vn, vg, oprsz, false); 4039 } 4040 4041 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4042 { 4043 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4044 compute_brk_m(vd, vn, vg, oprsz, true); 4045 } 4046 4047 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4048 { 4049 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4050 return compute_brks_m(vd, vn, vg, oprsz, true); 4051 } 4052 4053 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4054 { 4055 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4056 compute_brk_m(vd, vn, vg, oprsz, false); 4057 } 4058 4059 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4060 { 4061 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4062 return compute_brks_m(vd, vn, vg, oprsz, false); 4063 } 4064 4065 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4066 { 4067 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4068 if (!last_active_pred(vn, vg, oprsz)) { 4069 do_zero(vd, oprsz); 4070 } 4071 } 4072 4073 /* As if PredTest(Ones(PL), D, esz). 
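* I.e. compute NZCV for D with a conceptually all-true governing predicate
* of the given element size; used below by the BRKNS and WHILE helpers.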
*/ 4074 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, 4075 uint64_t esz_mask) 4076 { 4077 uint32_t flags = PREDTEST_INIT; 4078 intptr_t i; 4079 4080 for (i = 0; i < oprsz / 8; i++) { 4081 flags = iter_predtest_fwd(d->p[i], esz_mask, flags); 4082 } 4083 if (oprsz & 7) { 4084 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); 4085 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); 4086 } 4087 return flags; 4088 } 4089 4090 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4091 { 4092 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4093 if (last_active_pred(vn, vg, oprsz)) { 4094 return predtest_ones(vd, oprsz, -1); 4095 } else { 4096 return do_zero(vd, oprsz); 4097 } 4098 } 4099 4100 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) 4101 { 4102 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 4103 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4104 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; 4105 intptr_t i; 4106 4107 for (i = 0; i < words; ++i) { 4108 uint64_t t = n[i] & g[i] & mask; 4109 sum += ctpop64(t); 4110 } 4111 return sum; 4112 } 4113 4114 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) 4115 { 4116 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4117 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4118 uint64_t esz_mask = pred_esz_masks[esz]; 4119 ARMPredicateReg *d = vd; 4120 uint32_t flags; 4121 intptr_t i; 4122 4123 /* Begin with a zero predicate register. */ 4124 flags = do_zero(d, oprsz); 4125 if (count == 0) { 4126 return flags; 4127 } 4128 4129 /* Set all of the requested bits. */ 4130 for (i = 0; i < count / 64; ++i) { 4131 d->p[i] = esz_mask; 4132 } 4133 if (count & 63) { 4134 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; 4135 } 4136 4137 return predtest_ones(d, oprsz, esz_mask); 4138 } 4139 4140 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) 4141 { 4142 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4143 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4144 uint64_t esz_mask = pred_esz_masks[esz]; 4145 ARMPredicateReg *d = vd; 4146 intptr_t i, invcount, oprbits; 4147 uint64_t bits; 4148 4149 if (count == 0) { 4150 return do_zero(d, oprsz); 4151 } 4152 4153 oprbits = oprsz * 8; 4154 tcg_debug_assert(count <= oprbits); 4155 4156 bits = esz_mask; 4157 if (oprbits & 63) { 4158 bits &= MAKE_64BIT_MASK(0, oprbits & 63); 4159 } 4160 4161 invcount = oprbits - count; 4162 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { 4163 d->p[i] = bits; 4164 bits = esz_mask; 4165 } 4166 4167 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); 4168 4169 while (--i >= 0) { 4170 d->p[i] = 0; 4171 } 4172 4173 return predtest_ones(d, oprsz, esz_mask); 4174 } 4175 4176 /* Recursive reduction on a function; 4177 * C.f. the ARM ARM function ReducePredicated. 4178 * 4179 * While it would be possible to write this without the DATA temporary, 4180 * it is much simpler to process the predicate register this way. 4181 * The recursion is bounded to depth 7 (128 fp16 elements), so there's 4182 * little to gain with a more complex non-recursive form. 
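* For example, four elements reduce as FUNC(FUNC(d0, d1), FUNC(d2, d3));
* inactive elements, and the tail between OPRSZ and MAXSZ, are first
* replaced by IDENT so they cannot affect the result.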
4183 */ 4184 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ 4185 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ 4186 { \ 4187 if (n == 1) { \ 4188 return *data; \ 4189 } else { \ 4190 uintptr_t half = n / 2; \ 4191 TYPE lo = NAME##_reduce(data, status, half); \ 4192 TYPE hi = NAME##_reduce(data + half, status, half); \ 4193 return TYPE##_##FUNC(lo, hi, status); \ 4194 } \ 4195 } \ 4196 uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \ 4197 { \ 4198 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ 4199 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ 4200 for (i = 0; i < oprsz; ) { \ 4201 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 4202 do { \ 4203 TYPE nn = *(TYPE *)(vn + H(i)); \ 4204 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \ 4205 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 4206 } while (i & 15); \ 4207 } \ 4208 for (; i < maxsz; i += sizeof(TYPE)) { \ 4209 *(TYPE *)((void *)data + i) = IDENT; \ 4210 } \ 4211 return NAME##_reduce(data, s, maxsz / sizeof(TYPE)); \ 4212 } 4213 4214 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero) 4215 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero) 4216 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero) 4217 4218 /* Identity is floatN_default_nan, without the function call. */ 4219 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00) 4220 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000) 4221 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL) 4222 4223 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00) 4224 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000) 4225 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL) 4226 4227 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity) 4228 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity) 4229 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity) 4230 4231 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity)) 4232 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity)) 4233 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity)) 4234 4235 #undef DO_REDUCE 4236 4237 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, 4238 float_status *status, uint32_t desc) 4239 { 4240 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4241 float16 result = nn; 4242 4243 do { 4244 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4245 do { 4246 if (pg & 1) { 4247 float16 mm = *(float16 *)(vm + H1_2(i)); 4248 result = float16_add(result, mm, status); 4249 } 4250 i += sizeof(float16), pg >>= sizeof(float16); 4251 } while (i & 15); 4252 } while (i < opr_sz); 4253 4254 return result; 4255 } 4256 4257 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, 4258 float_status *status, uint32_t desc) 4259 { 4260 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4261 float32 result = nn; 4262 4263 do { 4264 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4265 do { 4266 if (pg & 1) { 4267 float32 mm = *(float32 *)(vm + H1_2(i)); 4268 result = float32_add(result, mm, status); 4269 } 4270 i += sizeof(float32), pg >>= sizeof(float32); 4271 } while (i & 15); 4272 } while (i < opr_sz); 4273 4274 return result; 4275 } 4276 4277 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, 4278 float_status *status, uint32_t desc) 4279 { 4280 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; 4281 uint64_t *m = vm; 4282 uint8_t *pg = vg; 4283 4284 for (i = 0; i < opr_sz; i++) { 4285 if (pg[H1(i)] & 
1) { 4286 nn = float64_add(nn, m[i], status); 4287 } 4288 } 4289 4290 return nn; 4291 } 4292 4293 /* Fully general three-operand expander, controlled by a predicate, 4294 * With the extra float_status parameter. 4295 */ 4296 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ 4297 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4298 float_status *status, uint32_t desc) \ 4299 { \ 4300 intptr_t i = simd_oprsz(desc); \ 4301 uint64_t *g = vg; \ 4302 do { \ 4303 uint64_t pg = g[(i - 1) >> 6]; \ 4304 do { \ 4305 i -= sizeof(TYPE); \ 4306 if (likely((pg >> (i & 63)) & 1)) { \ 4307 TYPE nn = *(TYPE *)(vn + H(i)); \ 4308 TYPE mm = *(TYPE *)(vm + H(i)); \ 4309 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4310 } \ 4311 } while (i & 63); \ 4312 } while (i != 0); \ 4313 } 4314 4315 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) 4316 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) 4317 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) 4318 4319 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) 4320 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) 4321 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) 4322 4323 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) 4324 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) 4325 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) 4326 4327 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) 4328 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) 4329 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) 4330 4331 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) 4332 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) 4333 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) 4334 4335 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) 4336 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) 4337 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) 4338 4339 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) 4340 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) 4341 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) 4342 4343 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) 4344 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) 4345 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) 4346 4347 static inline float16 abd_h(float16 a, float16 b, float_status *s) 4348 { 4349 return float16_abs(float16_sub(a, b, s)); 4350 } 4351 4352 static inline float32 abd_s(float32 a, float32 b, float_status *s) 4353 { 4354 return float32_abs(float32_sub(a, b, s)); 4355 } 4356 4357 static inline float64 abd_d(float64 a, float64 b, float_status *s) 4358 { 4359 return float64_abs(float64_sub(a, b, s)); 4360 } 4361 4362 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) 4363 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) 4364 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) 4365 4366 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) 4367 { 4368 int b_int = MIN(MAX(b, INT_MIN), INT_MAX); 4369 return float64_scalbn(a, b_int, s); 4370 } 4371 4372 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) 4373 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) 4374 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) 4375 4376 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) 4377 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) 4378 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) 4379 4380 #undef DO_ZPZZ_FP 4381 4382 /* Three-operand expander, with one scalar operand, controlled by 4383 * a predicate, with the extra float_status parameter. 
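* The scalar operand arrives in the low bits of a uint64_t and is
* converted to TYPE once, outside the loop, before being applied to every
* active element.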
4384 */ 4385 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \ 4386 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ 4387 float_status *status, uint32_t desc) \ 4388 { \ 4389 intptr_t i = simd_oprsz(desc); \ 4390 uint64_t *g = vg; \ 4391 TYPE mm = scalar; \ 4392 do { \ 4393 uint64_t pg = g[(i - 1) >> 6]; \ 4394 do { \ 4395 i -= sizeof(TYPE); \ 4396 if (likely((pg >> (i & 63)) & 1)) { \ 4397 TYPE nn = *(TYPE *)(vn + H(i)); \ 4398 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4399 } \ 4400 } while (i & 63); \ 4401 } while (i != 0); \ 4402 } 4403 4404 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) 4405 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) 4406 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) 4407 4408 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) 4409 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) 4410 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) 4411 4412 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) 4413 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) 4414 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) 4415 4416 static inline float16 subr_h(float16 a, float16 b, float_status *s) 4417 { 4418 return float16_sub(b, a, s); 4419 } 4420 4421 static inline float32 subr_s(float32 a, float32 b, float_status *s) 4422 { 4423 return float32_sub(b, a, s); 4424 } 4425 4426 static inline float64 subr_d(float64 a, float64 b, float_status *s) 4427 { 4428 return float64_sub(b, a, s); 4429 } 4430 4431 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) 4432 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) 4433 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) 4434 4435 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) 4436 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) 4437 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) 4438 4439 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) 4440 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) 4441 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) 4442 4443 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) 4444 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) 4445 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) 4446 4447 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) 4448 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) 4449 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) 4450 4451 /* Fully general two-operand expander, controlled by a predicate, 4452 * With the extra float_status parameter. 4453 */ 4454 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \ 4455 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 4456 float_status *status, uint32_t desc) \ 4457 { \ 4458 intptr_t i = simd_oprsz(desc); \ 4459 uint64_t *g = vg; \ 4460 do { \ 4461 uint64_t pg = g[(i - 1) >> 6]; \ 4462 do { \ 4463 i -= sizeof(TYPE); \ 4464 if (likely((pg >> (i & 63)) & 1)) { \ 4465 TYPE nn = *(TYPE *)(vn + H(i)); \ 4466 *(TYPE *)(vd + H(i)) = OP(nn, status); \ 4467 } \ 4468 } while (i & 63); \ 4469 } while (i != 0); \ 4470 } 4471 4472 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore 4473 * FZ16. When converting from fp16, this affects flushing input denormals; 4474 * when converting to fp16, this affects flushing output denormals. 
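* The helpers below implement this by saving the relevant flush flag,
* clearing it around the softfloat conversion, and restoring it afterwards:
* flush_inputs_to_zero when converting from fp16, flush_to_zero when
* converting to fp16.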
4475 */ 4476 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) 4477 { 4478 bool save = get_flush_inputs_to_zero(fpst); 4479 float32 ret; 4480 4481 set_flush_inputs_to_zero(false, fpst); 4482 ret = float16_to_float32(f, true, fpst); 4483 set_flush_inputs_to_zero(save, fpst); 4484 return ret; 4485 } 4486 4487 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) 4488 { 4489 bool save = get_flush_inputs_to_zero(fpst); 4490 float64 ret; 4491 4492 set_flush_inputs_to_zero(false, fpst); 4493 ret = float16_to_float64(f, true, fpst); 4494 set_flush_inputs_to_zero(save, fpst); 4495 return ret; 4496 } 4497 4498 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) 4499 { 4500 bool save = get_flush_to_zero(fpst); 4501 float16 ret; 4502 4503 set_flush_to_zero(false, fpst); 4504 ret = float32_to_float16(f, true, fpst); 4505 set_flush_to_zero(save, fpst); 4506 return ret; 4507 } 4508 4509 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) 4510 { 4511 bool save = get_flush_to_zero(fpst); 4512 float16 ret; 4513 4514 set_flush_to_zero(false, fpst); 4515 ret = float64_to_float16(f, true, fpst); 4516 set_flush_to_zero(save, fpst); 4517 return ret; 4518 } 4519 4520 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) 4521 { 4522 if (float16_is_any_nan(f)) { 4523 float_raise(float_flag_invalid, s); 4524 return 0; 4525 } 4526 return float16_to_int16_round_to_zero(f, s); 4527 } 4528 4529 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) 4530 { 4531 if (float16_is_any_nan(f)) { 4532 float_raise(float_flag_invalid, s); 4533 return 0; 4534 } 4535 return float16_to_int64_round_to_zero(f, s); 4536 } 4537 4538 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) 4539 { 4540 if (float32_is_any_nan(f)) { 4541 float_raise(float_flag_invalid, s); 4542 return 0; 4543 } 4544 return float32_to_int64_round_to_zero(f, s); 4545 } 4546 4547 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) 4548 { 4549 if (float64_is_any_nan(f)) { 4550 float_raise(float_flag_invalid, s); 4551 return 0; 4552 } 4553 return float64_to_int64_round_to_zero(f, s); 4554 } 4555 4556 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) 4557 { 4558 if (float16_is_any_nan(f)) { 4559 float_raise(float_flag_invalid, s); 4560 return 0; 4561 } 4562 return float16_to_uint16_round_to_zero(f, s); 4563 } 4564 4565 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) 4566 { 4567 if (float16_is_any_nan(f)) { 4568 float_raise(float_flag_invalid, s); 4569 return 0; 4570 } 4571 return float16_to_uint64_round_to_zero(f, s); 4572 } 4573 4574 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) 4575 { 4576 if (float32_is_any_nan(f)) { 4577 float_raise(float_flag_invalid, s); 4578 return 0; 4579 } 4580 return float32_to_uint64_round_to_zero(f, s); 4581 } 4582 4583 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) 4584 { 4585 if (float64_is_any_nan(f)) { 4586 float_raise(float_flag_invalid, s); 4587 return 0; 4588 } 4589 return float64_to_uint64_round_to_zero(f, s); 4590 } 4591 4592 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) 4593 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) 4594 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) 4595 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16) 4596 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) 4597 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, 
float64_to_float32) 4598 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) 4599 4600 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) 4601 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) 4602 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) 4603 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) 4604 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 4605 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) 4606 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) 4607 4608 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) 4609 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) 4610 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) 4611 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) 4612 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) 4613 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) 4614 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) 4615 4616 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) 4617 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) 4618 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) 4619 4620 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) 4621 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) 4622 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) 4623 4624 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) 4625 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) 4626 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) 4627 4628 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) 4629 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) 4630 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) 4631 4632 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) 4633 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) 4634 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) 4635 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) 4636 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) 4637 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) 4638 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) 4639 4640 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) 4641 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) 4642 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) 4643 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) 4644 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) 4645 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) 4646 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) 4647 4648 static int16_t do_float16_logb_as_int(float16 a, float_status *s) 4649 { 4650 /* Extract frac to the top of the uint32_t. 
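* With the 10-bit fraction left-justified, clz32 below counts the leading
* fractional zeros directly, which is exactly what the denormal case needs.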
*/ 4651 uint32_t frac = (uint32_t)a << (16 + 6); 4652 int16_t exp = extract32(a, 10, 5); 4653 4654 if (unlikely(exp == 0)) { 4655 if (frac != 0) { 4656 if (!get_flush_inputs_to_zero(s)) { 4657 /* denormal: bias - fractional_zeros */ 4658 return -15 - clz32(frac); 4659 } 4660 /* flush to zero */ 4661 float_raise(float_flag_input_denormal, s); 4662 } 4663 } else if (unlikely(exp == 0x1f)) { 4664 if (frac == 0) { 4665 return INT16_MAX; /* infinity */ 4666 } 4667 } else { 4668 /* normal: exp - bias */ 4669 return exp - 15; 4670 } 4671 /* nan or zero */ 4672 float_raise(float_flag_invalid, s); 4673 return INT16_MIN; 4674 } 4675 4676 static int32_t do_float32_logb_as_int(float32 a, float_status *s) 4677 { 4678 /* Extract frac to the top of the uint32_t. */ 4679 uint32_t frac = a << 9; 4680 int32_t exp = extract32(a, 23, 8); 4681 4682 if (unlikely(exp == 0)) { 4683 if (frac != 0) { 4684 if (!get_flush_inputs_to_zero(s)) { 4685 /* denormal: bias - fractional_zeros */ 4686 return -127 - clz32(frac); 4687 } 4688 /* flush to zero */ 4689 float_raise(float_flag_input_denormal, s); 4690 } 4691 } else if (unlikely(exp == 0xff)) { 4692 if (frac == 0) { 4693 return INT32_MAX; /* infinity */ 4694 } 4695 } else { 4696 /* normal: exp - bias */ 4697 return exp - 127; 4698 } 4699 /* nan or zero */ 4700 float_raise(float_flag_invalid, s); 4701 return INT32_MIN; 4702 } 4703 4704 static int64_t do_float64_logb_as_int(float64 a, float_status *s) 4705 { 4706 /* Extract frac to the top of the uint64_t. */ 4707 uint64_t frac = a << 12; 4708 int64_t exp = extract64(a, 52, 11); 4709 4710 if (unlikely(exp == 0)) { 4711 if (frac != 0) { 4712 if (!get_flush_inputs_to_zero(s)) { 4713 /* denormal: bias - fractional_zeros */ 4714 return -1023 - clz64(frac); 4715 } 4716 /* flush to zero */ 4717 float_raise(float_flag_input_denormal, s); 4718 } 4719 } else if (unlikely(exp == 0x7ff)) { 4720 if (frac == 0) { 4721 return INT64_MAX; /* infinity */ 4722 } 4723 } else { 4724 /* normal: exp - bias */ 4725 return exp - 1023; 4726 } 4727 /* nan or zero */ 4728 float_raise(float_flag_invalid, s); 4729 return INT64_MIN; 4730 } 4731 4732 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) 4733 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) 4734 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) 4735 4736 #undef DO_ZPZ_FP 4737 4738 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, 4739 float_status *status, uint32_t desc, 4740 uint16_t neg1, uint16_t neg3) 4741 { 4742 intptr_t i = simd_oprsz(desc); 4743 uint64_t *g = vg; 4744 4745 do { 4746 uint64_t pg = g[(i - 1) >> 6]; 4747 do { 4748 i -= 2; 4749 if (likely((pg >> (i & 63)) & 1)) { 4750 float16 e1, e2, e3, r; 4751 4752 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; 4753 e2 = *(uint16_t *)(vm + H1_2(i)); 4754 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; 4755 r = float16_muladd(e1, e2, e3, 0, status); 4756 *(uint16_t *)(vd + H1_2(i)) = r; 4757 } 4758 } while (i & 63); 4759 } while (i != 0); 4760 } 4761 4762 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4763 void *vg, float_status *status, uint32_t desc) 4764 { 4765 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0); 4766 } 4767 4768 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4769 void *vg, float_status *status, uint32_t desc) 4770 { 4771 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0); 4772 } 4773 4774 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4775 void *vg, float_status *status, uint32_t desc) 4776 { 
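/* FNMLA: flip the fp16 sign bit (0x8000) of both the first multiplicand and the addend before the fused multiply-add. */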
4777 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000); 4778 } 4779 4780 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4781 void *vg, float_status *status, uint32_t desc) 4782 { 4783 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000); 4784 } 4785 4786 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, 4787 float_status *status, uint32_t desc, 4788 uint32_t neg1, uint32_t neg3) 4789 { 4790 intptr_t i = simd_oprsz(desc); 4791 uint64_t *g = vg; 4792 4793 do { 4794 uint64_t pg = g[(i - 1) >> 6]; 4795 do { 4796 i -= 4; 4797 if (likely((pg >> (i & 63)) & 1)) { 4798 float32 e1, e2, e3, r; 4799 4800 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; 4801 e2 = *(uint32_t *)(vm + H1_4(i)); 4802 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; 4803 r = float32_muladd(e1, e2, e3, 0, status); 4804 *(uint32_t *)(vd + H1_4(i)) = r; 4805 } 4806 } while (i & 63); 4807 } while (i != 0); 4808 } 4809 4810 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4811 void *vg, float_status *status, uint32_t desc) 4812 { 4813 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0); 4814 } 4815 4816 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4817 void *vg, float_status *status, uint32_t desc) 4818 { 4819 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0); 4820 } 4821 4822 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4823 void *vg, float_status *status, uint32_t desc) 4824 { 4825 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000); 4826 } 4827 4828 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4829 void *vg, float_status *status, uint32_t desc) 4830 { 4831 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000); 4832 } 4833 4834 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, 4835 float_status *status, uint32_t desc, 4836 uint64_t neg1, uint64_t neg3) 4837 { 4838 intptr_t i = simd_oprsz(desc); 4839 uint64_t *g = vg; 4840 4841 do { 4842 uint64_t pg = g[(i - 1) >> 6]; 4843 do { 4844 i -= 8; 4845 if (likely((pg >> (i & 63)) & 1)) { 4846 float64 e1, e2, e3, r; 4847 4848 e1 = *(uint64_t *)(vn + i) ^ neg1; 4849 e2 = *(uint64_t *)(vm + i); 4850 e3 = *(uint64_t *)(va + i) ^ neg3; 4851 r = float64_muladd(e1, e2, e3, 0, status); 4852 *(uint64_t *)(vd + i) = r; 4853 } 4854 } while (i & 63); 4855 } while (i != 0); 4856 } 4857 4858 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4859 void *vg, float_status *status, uint32_t desc) 4860 { 4861 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0); 4862 } 4863 4864 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4865 void *vg, float_status *status, uint32_t desc) 4866 { 4867 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0); 4868 } 4869 4870 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4871 void *vg, float_status *status, uint32_t desc) 4872 { 4873 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN); 4874 } 4875 4876 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4877 void *vg, float_status *status, uint32_t desc) 4878 { 4879 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN); 4880 } 4881 4882 /* Two operand floating-point comparison controlled by a predicate. 
4883 * Unlike the integer version, we are not allowed to optimistically 4884 * compare operands, since the comparison may have side effects wrt 4885 * the FPSR. 4886 */ 4887 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ 4888 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4889 float_status *status, uint32_t desc) \ 4890 { \ 4891 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 4892 uint64_t *d = vd, *g = vg; \ 4893 do { \ 4894 uint64_t out = 0, pg = g[j]; \ 4895 do { \ 4896 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 4897 if (likely((pg >> (i & 63)) & 1)) { \ 4898 TYPE nn = *(TYPE *)(vn + H(i)); \ 4899 TYPE mm = *(TYPE *)(vm + H(i)); \ 4900 out |= OP(TYPE, nn, mm, status); \ 4901 } \ 4902 } while (i & 63); \ 4903 d[j--] = out; \ 4904 } while (i > 0); \ 4905 } 4906 4907 #define DO_FPCMP_PPZZ_H(NAME, OP) \ 4908 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) 4909 #define DO_FPCMP_PPZZ_S(NAME, OP) \ 4910 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) 4911 #define DO_FPCMP_PPZZ_D(NAME, OP) \ 4912 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) 4913 4914 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \ 4915 DO_FPCMP_PPZZ_H(NAME, OP) \ 4916 DO_FPCMP_PPZZ_S(NAME, OP) \ 4917 DO_FPCMP_PPZZ_D(NAME, OP) 4918 4919 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 4920 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 4921 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 4922 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 4923 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 4924 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 4925 #define DO_FCMUO(TYPE, X, Y, ST) \ 4926 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered 4927 #define DO_FACGE(TYPE, X, Y, ST) \ 4928 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 4929 #define DO_FACGT(TYPE, X, Y, ST) \ 4930 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 4931 4932 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) 4933 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) 4934 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) 4935 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) 4936 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) 4937 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) 4938 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) 4939 4940 #undef DO_FPCMP_PPZZ_ALL 4941 #undef DO_FPCMP_PPZZ_D 4942 #undef DO_FPCMP_PPZZ_S 4943 #undef DO_FPCMP_PPZZ_H 4944 #undef DO_FPCMP_PPZZ 4945 4946 /* One operand floating-point comparison against zero, controlled 4947 * by a predicate. 
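* The literal 0 passed as the second comparison operand is also the IEEE
* encoding of +0.0 for every element size, so no per-type zero constant is
* needed.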
4948 */ 4949 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ 4950 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 4951 float_status *status, uint32_t desc) \ 4952 { \ 4953 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 4954 uint64_t *d = vd, *g = vg; \ 4955 do { \ 4956 uint64_t out = 0, pg = g[j]; \ 4957 do { \ 4958 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 4959 if ((pg >> (i & 63)) & 1) { \ 4960 TYPE nn = *(TYPE *)(vn + H(i)); \ 4961 out |= OP(TYPE, nn, 0, status); \ 4962 } \ 4963 } while (i & 63); \ 4964 d[j--] = out; \ 4965 } while (i > 0); \ 4966 } 4967 4968 #define DO_FPCMP_PPZ0_H(NAME, OP) \ 4969 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) 4970 #define DO_FPCMP_PPZ0_S(NAME, OP) \ 4971 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) 4972 #define DO_FPCMP_PPZ0_D(NAME, OP) \ 4973 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) 4974 4975 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \ 4976 DO_FPCMP_PPZ0_H(NAME, OP) \ 4977 DO_FPCMP_PPZ0_S(NAME, OP) \ 4978 DO_FPCMP_PPZ0_D(NAME, OP) 4979 4980 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) 4981 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) 4982 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) 4983 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) 4984 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) 4985 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) 4986 4987 /* FP Trig Multiply-Add. */ 4988 4989 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, 4990 float_status *s, uint32_t desc) 4991 { 4992 static const float16 coeff[16] = { 4993 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 4994 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 4995 }; 4996 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); 4997 intptr_t x = simd_data(desc); 4998 float16 *d = vd, *n = vn, *m = vm; 4999 for (i = 0; i < opr_sz; i++) { 5000 float16 mm = m[i]; 5001 intptr_t xx = x; 5002 if (float16_is_neg(mm)) { 5003 mm = float16_abs(mm); 5004 xx += 8; 5005 } 5006 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, s); 5007 } 5008 } 5009 5010 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, 5011 float_status *s, uint32_t desc) 5012 { 5013 static const float32 coeff[16] = { 5014 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, 5015 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, 5016 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, 5017 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, 5018 }; 5019 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); 5020 intptr_t x = simd_data(desc); 5021 float32 *d = vd, *n = vn, *m = vm; 5022 for (i = 0; i < opr_sz; i++) { 5023 float32 mm = m[i]; 5024 intptr_t xx = x; 5025 if (float32_is_neg(mm)) { 5026 mm = float32_abs(mm); 5027 xx += 8; 5028 } 5029 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, s); 5030 } 5031 } 5032 5033 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, 5034 float_status *s, uint32_t desc) 5035 { 5036 static const float64 coeff[16] = { 5037 0x3ff0000000000000ull, 0xbfc5555555555543ull, 5038 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, 5039 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, 5040 0x3de5d8408868552full, 0x0000000000000000ull, 5041 0x3ff0000000000000ull, 0xbfe0000000000000ull, 5042 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, 5043 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, 5044 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, 5045 }; 5046 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); 5047 intptr_t x = simd_data(desc); 5048 float64 *d = vd, *n = vn, *m = vm; 5049 for (i = 0; i < opr_sz; i++) { 5050 float64 mm = m[i]; 5051 intptr_t xx = x; 5052 if (float64_is_neg(mm)) { 5053 mm = float64_abs(mm); 5054 xx += 8; 5055 } 
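/* coeff[xx] supplies this step's polynomial constant; the second half of the table was selected above when the multiplicand was negative. */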
5056 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, s); 5057 } 5058 } 5059 5060 /* 5061 * FP Complex Add 5062 */ 5063 5064 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, 5065 float_status *s, uint32_t desc) 5066 { 5067 intptr_t j, i = simd_oprsz(desc); 5068 uint64_t *g = vg; 5069 float16 neg_imag = float16_set_sign(0, simd_data(desc)); 5070 float16 neg_real = float16_chs(neg_imag); 5071 5072 do { 5073 uint64_t pg = g[(i - 1) >> 6]; 5074 do { 5075 float16 e0, e1, e2, e3; 5076 5077 /* I holds the real index; J holds the imag index. */ 5078 j = i - sizeof(float16); 5079 i -= 2 * sizeof(float16); 5080 5081 e0 = *(float16 *)(vn + H1_2(i)); 5082 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real; 5083 e2 = *(float16 *)(vn + H1_2(j)); 5084 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag; 5085 5086 if (likely((pg >> (i & 63)) & 1)) { 5087 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s); 5088 } 5089 if (likely((pg >> (j & 63)) & 1)) { 5090 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s); 5091 } 5092 } while (i & 63); 5093 } while (i != 0); 5094 } 5095 5096 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, 5097 float_status *s, uint32_t desc) 5098 { 5099 intptr_t j, i = simd_oprsz(desc); 5100 uint64_t *g = vg; 5101 float32 neg_imag = float32_set_sign(0, simd_data(desc)); 5102 float32 neg_real = float32_chs(neg_imag); 5103 5104 do { 5105 uint64_t pg = g[(i - 1) >> 6]; 5106 do { 5107 float32 e0, e1, e2, e3; 5108 5109 /* I holds the real index; J holds the imag index. */ 5110 j = i - sizeof(float32); 5111 i -= 2 * sizeof(float32); 5112 5113 e0 = *(float32 *)(vn + H1_2(i)); 5114 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real; 5115 e2 = *(float32 *)(vn + H1_2(j)); 5116 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag; 5117 5118 if (likely((pg >> (i & 63)) & 1)) { 5119 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s); 5120 } 5121 if (likely((pg >> (j & 63)) & 1)) { 5122 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s); 5123 } 5124 } while (i & 63); 5125 } while (i != 0); 5126 } 5127 5128 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, 5129 float_status *s, uint32_t desc) 5130 { 5131 intptr_t j, i = simd_oprsz(desc); 5132 uint64_t *g = vg; 5133 float64 neg_imag = float64_set_sign(0, simd_data(desc)); 5134 float64 neg_real = float64_chs(neg_imag); 5135 5136 do { 5137 uint64_t pg = g[(i - 1) >> 6]; 5138 do { 5139 float64 e0, e1, e2, e3; 5140 5141 /* I holds the real index; J holds the imag index. 
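* The DATA bit chooses which addend is negated: when clear, M's imaginary
* part is negated into the real lane; when set, M's real part is negated
* into the imaginary lane (the two FCADD rotations).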
*/ 5142 j = i - sizeof(float64); 5143 i -= 2 * sizeof(float64); 5144 5145 e0 = *(float64 *)(vn + H1_2(i)); 5146 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real; 5147 e2 = *(float64 *)(vn + H1_2(j)); 5148 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag; 5149 5150 if (likely((pg >> (i & 63)) & 1)) { 5151 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s); 5152 } 5153 if (likely((pg >> (j & 63)) & 1)) { 5154 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s); 5155 } 5156 } while (i & 63); 5157 } while (i != 0); 5158 } 5159 5160 /* 5161 * FP Complex Multiply 5162 */ 5163 5164 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5165 void *vg, float_status *status, uint32_t desc) 5166 { 5167 intptr_t j, i = simd_oprsz(desc); 5168 unsigned rot = simd_data(desc); 5169 bool flip = rot & 1; 5170 float16 neg_imag, neg_real; 5171 uint64_t *g = vg; 5172 5173 neg_imag = float16_set_sign(0, (rot & 2) != 0); 5174 neg_real = float16_set_sign(0, rot == 1 || rot == 2); 5175 5176 do { 5177 uint64_t pg = g[(i - 1) >> 6]; 5178 do { 5179 float16 e1, e2, e3, e4, nr, ni, mr, mi, d; 5180 5181 /* I holds the real index; J holds the imag index. */ 5182 j = i - sizeof(float16); 5183 i -= 2 * sizeof(float16); 5184 5185 nr = *(float16 *)(vn + H1_2(i)); 5186 ni = *(float16 *)(vn + H1_2(j)); 5187 mr = *(float16 *)(vm + H1_2(i)); 5188 mi = *(float16 *)(vm + H1_2(j)); 5189 5190 e2 = (flip ? ni : nr); 5191 e1 = (flip ? mi : mr) ^ neg_real; 5192 e4 = e2; 5193 e3 = (flip ? mr : mi) ^ neg_imag; 5194 5195 if (likely((pg >> (i & 63)) & 1)) { 5196 d = *(float16 *)(va + H1_2(i)); 5197 d = float16_muladd(e2, e1, d, 0, status); 5198 *(float16 *)(vd + H1_2(i)) = d; 5199 } 5200 if (likely((pg >> (j & 63)) & 1)) { 5201 d = *(float16 *)(va + H1_2(j)); 5202 d = float16_muladd(e4, e3, d, 0, status); 5203 *(float16 *)(vd + H1_2(j)) = d; 5204 } 5205 } while (i & 63); 5206 } while (i != 0); 5207 } 5208 5209 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5210 void *vg, float_status *status, uint32_t desc) 5211 { 5212 intptr_t j, i = simd_oprsz(desc); 5213 unsigned rot = simd_data(desc); 5214 bool flip = rot & 1; 5215 float32 neg_imag, neg_real; 5216 uint64_t *g = vg; 5217 5218 neg_imag = float32_set_sign(0, (rot & 2) != 0); 5219 neg_real = float32_set_sign(0, rot == 1 || rot == 2); 5220 5221 do { 5222 uint64_t pg = g[(i - 1) >> 6]; 5223 do { 5224 float32 e1, e2, e3, e4, nr, ni, mr, mi, d; 5225 5226 /* I holds the real index; J holds the imag index. */ 5227 j = i - sizeof(float32); 5228 i -= 2 * sizeof(float32); 5229 5230 nr = *(float32 *)(vn + H1_2(i)); 5231 ni = *(float32 *)(vn + H1_2(j)); 5232 mr = *(float32 *)(vm + H1_2(i)); 5233 mi = *(float32 *)(vm + H1_2(j)); 5234 5235 e2 = (flip ? ni : nr); 5236 e1 = (flip ? mi : mr) ^ neg_real; 5237 e4 = e2; 5238 e3 = (flip ? 
mr : mi) ^ neg_imag; 5239 5240 if (likely((pg >> (i & 63)) & 1)) { 5241 d = *(float32 *)(va + H1_2(i)); 5242 d = float32_muladd(e2, e1, d, 0, status); 5243 *(float32 *)(vd + H1_2(i)) = d; 5244 } 5245 if (likely((pg >> (j & 63)) & 1)) { 5246 d = *(float32 *)(va + H1_2(j)); 5247 d = float32_muladd(e4, e3, d, 0, status); 5248 *(float32 *)(vd + H1_2(j)) = d; 5249 } 5250 } while (i & 63); 5251 } while (i != 0); 5252 } 5253 5254 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5255 void *vg, float_status *status, uint32_t desc) 5256 { 5257 intptr_t j, i = simd_oprsz(desc); 5258 unsigned rot = simd_data(desc); 5259 bool flip = rot & 1; 5260 float64 neg_imag, neg_real; 5261 uint64_t *g = vg; 5262 5263 neg_imag = float64_set_sign(0, (rot & 2) != 0); 5264 neg_real = float64_set_sign(0, rot == 1 || rot == 2); 5265 5266 do { 5267 uint64_t pg = g[(i - 1) >> 6]; 5268 do { 5269 float64 e1, e2, e3, e4, nr, ni, mr, mi, d; 5270 5271 /* I holds the real index; J holds the imag index. */ 5272 j = i - sizeof(float64); 5273 i -= 2 * sizeof(float64); 5274 5275 nr = *(float64 *)(vn + H1_2(i)); 5276 ni = *(float64 *)(vn + H1_2(j)); 5277 mr = *(float64 *)(vm + H1_2(i)); 5278 mi = *(float64 *)(vm + H1_2(j)); 5279 5280 e2 = (flip ? ni : nr); 5281 e1 = (flip ? mi : mr) ^ neg_real; 5282 e4 = e2; 5283 e3 = (flip ? mr : mi) ^ neg_imag; 5284 5285 if (likely((pg >> (i & 63)) & 1)) { 5286 d = *(float64 *)(va + H1_2(i)); 5287 d = float64_muladd(e2, e1, d, 0, status); 5288 *(float64 *)(vd + H1_2(i)) = d; 5289 } 5290 if (likely((pg >> (j & 63)) & 1)) { 5291 d = *(float64 *)(va + H1_2(j)); 5292 d = float64_muladd(e4, e3, d, 0, status); 5293 *(float64 *)(vd + H1_2(j)) = d; 5294 } 5295 } while (i & 63); 5296 } while (i != 0); 5297 } 5298 5299 /* 5300 * Load contiguous data, protected by a governing predicate. 5301 */ 5302 5303 /* 5304 * Skip through a sequence of inactive elements in the guarding predicate @vg, 5305 * beginning at @reg_off bounded by @reg_max. Return the offset of the active 5306 * element >= @reg_off, or @reg_max if there were no active elements at all. 5307 */ 5308 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, 5309 intptr_t reg_max, int esz) 5310 { 5311 uint64_t pg_mask = pred_esz_masks[esz]; 5312 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); 5313 5314 /* In normal usage, the first element is active. */ 5315 if (likely(pg & 1)) { 5316 return reg_off; 5317 } 5318 5319 if (pg == 0) { 5320 reg_off &= -64; 5321 do { 5322 reg_off += 64; 5323 if (unlikely(reg_off >= reg_max)) { 5324 /* The entire predicate was false. */ 5325 return reg_max; 5326 } 5327 pg = vg[reg_off >> 6] & pg_mask; 5328 } while (pg == 0); 5329 } 5330 reg_off += ctz64(pg); 5331 5332 /* We should never see an out of range predicate bit set. */ 5333 tcg_debug_assert(reg_off < reg_max); 5334 return reg_off; 5335 } 5336 5337 /* 5338 * Resolve the guest virtual address to info->host and info->flags. 5339 * If @nofault, return false if the page is invalid, otherwise 5340 * exit via page fault exception. 5341 */ 5342 5343 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, 5344 target_ulong addr, int mem_off, MMUAccessType access_type, 5345 int mmu_idx, uintptr_t retaddr) 5346 { 5347 int flags; 5348 5349 addr += mem_off; 5350 5351 /* 5352 * User-only currently always issues with TBI. See the comment 5353 * above useronly_clean_ptr. Usually we clean this top byte away 5354 * during translation, but we can't do that for e.g. vector + imm 5355 * addressing modes. 
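     * ("Clean" here means stripping any tag from the top byte of the
     * pointer, bits 63:56, so that the page lookup below is performed
     * on the untagged address that the access actually resolves to.)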
5356 * 5357 * We currently always enable TBI for user-only, and do not provide 5358 * a way to turn it off. So clean the pointer unconditionally here, 5359 * rather than look it up here, or pass it down from above. 5360 */ 5361 addr = useronly_clean_ptr(addr); 5362 5363 #ifdef CONFIG_USER_ONLY 5364 flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault, 5365 &info->host, retaddr); 5366 #else 5367 CPUTLBEntryFull *full; 5368 flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault, 5369 &info->host, &full, retaddr); 5370 #endif 5371 info->flags = flags; 5372 5373 if (flags & TLB_INVALID_MASK) { 5374 g_assert(nofault); 5375 return false; 5376 } 5377 5378 #ifdef CONFIG_USER_ONLY 5379 memset(&info->attrs, 0, sizeof(info->attrs)); 5380 /* Require both ANON and MTE; see allocation_tag_mem(). */ 5381 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); 5382 #else 5383 info->attrs = full->attrs; 5384 info->tagged = full->extra.arm.pte_attrs == 0xf0; 5385 #endif 5386 5387 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ 5388 info->host -= mem_off; 5389 return true; 5390 } 5391 5392 /* 5393 * Find first active element on each page, and a loose bound for the 5394 * final element on each page. Identify any single element that spans 5395 * the page boundary. Return true if there are any active elements. 5396 */ 5397 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, 5398 intptr_t reg_max, int esz, int msize) 5399 { 5400 const int esize = 1 << esz; 5401 const uint64_t pg_mask = pred_esz_masks[esz]; 5402 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 5403 intptr_t mem_off_last, mem_off_split; 5404 intptr_t page_split, elt_split; 5405 intptr_t i; 5406 5407 /* Set all of the element indices to -1, and the TLB data to 0. */ 5408 memset(info, -1, offsetof(SVEContLdSt, page)); 5409 memset(info->page, 0, sizeof(info->page)); 5410 5411 /* Gross scan over the entire predicate to find bounds. */ 5412 i = 0; 5413 do { 5414 uint64_t pg = vg[i] & pg_mask; 5415 if (pg) { 5416 reg_off_last = i * 64 + 63 - clz64(pg); 5417 if (reg_off_first < 0) { 5418 reg_off_first = i * 64 + ctz64(pg); 5419 } 5420 } 5421 } while (++i * 64 < reg_max); 5422 5423 if (unlikely(reg_off_first < 0)) { 5424 /* No active elements, no pages touched. */ 5425 return false; 5426 } 5427 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); 5428 5429 info->reg_off_first[0] = reg_off_first; 5430 info->mem_off_first[0] = (reg_off_first >> esz) * msize; 5431 mem_off_last = (reg_off_last >> esz) * msize; 5432 5433 page_split = -(addr | TARGET_PAGE_MASK); 5434 if (likely(mem_off_last + msize <= page_split)) { 5435 /* The entire operation fits within a single page. */ 5436 info->reg_off_last[0] = reg_off_last; 5437 return true; 5438 } 5439 5440 info->page_split = page_split; 5441 elt_split = page_split / msize; 5442 reg_off_split = elt_split << esz; 5443 mem_off_split = elt_split * msize; 5444 5445 /* 5446 * This is the last full element on the first page, but it is not 5447 * necessarily active. If there is no full element, i.e. the first 5448 * active element is the one that's split, this value remains -1. 5449 * It is useful as iteration bounds. 5450 */ 5451 if (elt_split != 0) { 5452 info->reg_off_last[0] = reg_off_split - esize; 5453 } 5454 5455 /* Determine if an unaligned element spans the pages. */ 5456 if (page_split % msize != 0) { 5457 /* It is helpful to know if the split element is active. 
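         * For example, with 8-byte elements and page_split % 8 == 4, the
         * element at index elt_split begins 4 bytes before the page
         * boundary and ends 4 bytes after it; that is the single element
         * which needs both pages, and it is recorded below only if its
         * predicate bit is set.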
*/ 5458 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { 5459 info->reg_off_split = reg_off_split; 5460 info->mem_off_split = mem_off_split; 5461 5462 if (reg_off_split == reg_off_last) { 5463 /* The page crossing element is last. */ 5464 return true; 5465 } 5466 } 5467 reg_off_split += esize; 5468 mem_off_split += msize; 5469 } 5470 5471 /* 5472 * We do want the first active element on the second page, because 5473 * this may affect the address reported in an exception. 5474 */ 5475 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); 5476 tcg_debug_assert(reg_off_split <= reg_off_last); 5477 info->reg_off_first[1] = reg_off_split; 5478 info->mem_off_first[1] = (reg_off_split >> esz) * msize; 5479 info->reg_off_last[1] = reg_off_last; 5480 return true; 5481 } 5482 5483 /* 5484 * Resolve the guest virtual addresses to info->page[]. 5485 * Control the generation of page faults with @fault. Return false if 5486 * there is no work to do, which can only happen with @fault == FAULT_NO. 5487 */ 5488 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, 5489 CPUARMState *env, target_ulong addr, 5490 MMUAccessType access_type, uintptr_t retaddr) 5491 { 5492 int mmu_idx = arm_env_mmu_index(env); 5493 int mem_off = info->mem_off_first[0]; 5494 bool nofault = fault == FAULT_NO; 5495 bool have_work = true; 5496 5497 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, 5498 access_type, mmu_idx, retaddr)) { 5499 /* No work to be done. */ 5500 return false; 5501 } 5502 5503 if (likely(info->page_split < 0)) { 5504 /* The entire operation was on the one page. */ 5505 return true; 5506 } 5507 5508 /* 5509 * If the second page is invalid, then we want the fault address to be 5510 * the first byte on that page which is accessed. 5511 */ 5512 if (info->mem_off_split >= 0) { 5513 /* 5514 * There is an element split across the pages. The fault address 5515 * should be the first byte of the second page. 5516 */ 5517 mem_off = info->page_split; 5518 /* 5519 * If the split element is also the first active element 5520 * of the vector, then: For first-fault we should continue 5521 * to generate faults for the second page. For no-fault, 5522 * we have work only if the second page is valid. 5523 */ 5524 if (info->mem_off_first[0] < info->mem_off_split) { 5525 nofault = FAULT_FIRST; 5526 have_work = false; 5527 } 5528 } else { 5529 /* 5530 * There is no element split across the pages. The fault address 5531 * should be the first active element on the second page. 5532 */ 5533 mem_off = info->mem_off_first[1]; 5534 /* 5535 * There must have been one active element on the first page, 5536 * so we're out of first-fault territory. 5537 */ 5538 nofault = fault != FAULT_ALL; 5539 } 5540 5541 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off, 5542 access_type, mmu_idx, retaddr); 5543 return have_work; 5544 } 5545 5546 #ifndef CONFIG_USER_ONLY 5547 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, 5548 uint64_t *vg, target_ulong addr, 5549 int esize, int msize, int wp_access, 5550 uintptr_t retaddr) 5551 { 5552 intptr_t mem_off, reg_off, reg_last; 5553 int flags0 = info->page[0].flags; 5554 int flags1 = info->page[1].flags; 5555 5556 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { 5557 return; 5558 } 5559 5560 /* Indicate that watchpoints are handled. 
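     * Clearing TLB_WATCHPOINT from info->page[].flags means the caller's
     * "flags != 0" test no longer forces the slow per-element TLB path
     * merely because a watchpoint exists somewhere on the page; the
     * checks have already been performed here for every active element.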
*/ 5561 info->page[0].flags = flags0 & ~TLB_WATCHPOINT; 5562 info->page[1].flags = flags1 & ~TLB_WATCHPOINT; 5563 5564 if (flags0 & TLB_WATCHPOINT) { 5565 mem_off = info->mem_off_first[0]; 5566 reg_off = info->reg_off_first[0]; 5567 reg_last = info->reg_off_last[0]; 5568 5569 while (reg_off <= reg_last) { 5570 uint64_t pg = vg[reg_off >> 6]; 5571 do { 5572 if ((pg >> (reg_off & 63)) & 1) { 5573 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 5574 msize, info->page[0].attrs, 5575 wp_access, retaddr); 5576 } 5577 reg_off += esize; 5578 mem_off += msize; 5579 } while (reg_off <= reg_last && (reg_off & 63)); 5580 } 5581 } 5582 5583 mem_off = info->mem_off_split; 5584 if (mem_off >= 0) { 5585 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize, 5586 info->page[0].attrs, wp_access, retaddr); 5587 } 5588 5589 mem_off = info->mem_off_first[1]; 5590 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) { 5591 reg_off = info->reg_off_first[1]; 5592 reg_last = info->reg_off_last[1]; 5593 5594 do { 5595 uint64_t pg = vg[reg_off >> 6]; 5596 do { 5597 if ((pg >> (reg_off & 63)) & 1) { 5598 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 5599 msize, info->page[1].attrs, 5600 wp_access, retaddr); 5601 } 5602 reg_off += esize; 5603 mem_off += msize; 5604 } while (reg_off & 63); 5605 } while (reg_off <= reg_last); 5606 } 5607 } 5608 #endif 5609 5610 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, 5611 uint64_t *vg, target_ulong addr, int esize, 5612 int msize, uint32_t mtedesc, uintptr_t ra) 5613 { 5614 intptr_t mem_off, reg_off, reg_last; 5615 5616 /* Process the page only if MemAttr == Tagged. */ 5617 if (info->page[0].tagged) { 5618 mem_off = info->mem_off_first[0]; 5619 reg_off = info->reg_off_first[0]; 5620 reg_last = info->reg_off_split; 5621 if (reg_last < 0) { 5622 reg_last = info->reg_off_last[0]; 5623 } 5624 5625 do { 5626 uint64_t pg = vg[reg_off >> 6]; 5627 do { 5628 if ((pg >> (reg_off & 63)) & 1) { 5629 mte_check(env, mtedesc, addr, ra); 5630 } 5631 reg_off += esize; 5632 mem_off += msize; 5633 } while (reg_off <= reg_last && (reg_off & 63)); 5634 } while (reg_off <= reg_last); 5635 } 5636 5637 mem_off = info->mem_off_first[1]; 5638 if (mem_off >= 0 && info->page[1].tagged) { 5639 reg_off = info->reg_off_first[1]; 5640 reg_last = info->reg_off_last[1]; 5641 5642 do { 5643 uint64_t pg = vg[reg_off >> 6]; 5644 do { 5645 if ((pg >> (reg_off & 63)) & 1) { 5646 mte_check(env, mtedesc, addr, ra); 5647 } 5648 reg_off += esize; 5649 mem_off += msize; 5650 } while (reg_off & 63); 5651 } while (reg_off <= reg_last); 5652 } 5653 } 5654 5655 /* 5656 * Common helper for all contiguous 1,2,3,4-register predicated stores. 5657 */ 5658 static inline QEMU_ALWAYS_INLINE 5659 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, 5660 uint32_t desc, const uintptr_t retaddr, 5661 const int esz, const int msz, const int N, uint32_t mtedesc, 5662 sve_ldst1_host_fn *host_fn, 5663 sve_ldst1_tlb_fn *tlb_fn) 5664 { 5665 const unsigned rd = simd_data(desc); 5666 const intptr_t reg_max = simd_oprsz(desc); 5667 intptr_t reg_off, reg_last, mem_off; 5668 SVEContLdSt info; 5669 void *host; 5670 int flags, i; 5671 5672 /* Find the active elements. */ 5673 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 5674 /* The entire predicate was false; no load occurs. */ 5675 for (i = 0; i < N; ++i) { 5676 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5677 } 5678 return; 5679 } 5680 5681 /* Probe the page(s). Exit with exception for any invalid page. 
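     * With FAULT_ALL the probe itself raises the exception, unwinding
     * via retaddr, so on return every page touched is known to be valid
     * and the element loops below cannot fault on translation.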
*/ 5682 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); 5683 5684 /* Handle watchpoints for all active elements. */ 5685 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 5686 BP_MEM_READ, retaddr); 5687 5688 /* 5689 * Handle mte checks for all active elements. 5690 * Since TBI must be set for MTE, !mtedesc => !mte_active. 5691 */ 5692 if (mtedesc) { 5693 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 5694 mtedesc, retaddr); 5695 } 5696 5697 flags = info.page[0].flags | info.page[1].flags; 5698 if (unlikely(flags != 0)) { 5699 /* 5700 * At least one page includes MMIO. 5701 * Any bus operation can fail with cpu_transaction_failed, 5702 * which for ARM will raise SyncExternal. Perform the load 5703 * into scratch memory to preserve register state until the end. 5704 */ 5705 ARMVectorReg scratch[4] = { }; 5706 5707 mem_off = info.mem_off_first[0]; 5708 reg_off = info.reg_off_first[0]; 5709 reg_last = info.reg_off_last[1]; 5710 if (reg_last < 0) { 5711 reg_last = info.reg_off_split; 5712 if (reg_last < 0) { 5713 reg_last = info.reg_off_last[0]; 5714 } 5715 } 5716 5717 do { 5718 uint64_t pg = vg[reg_off >> 6]; 5719 do { 5720 if ((pg >> (reg_off & 63)) & 1) { 5721 for (i = 0; i < N; ++i) { 5722 tlb_fn(env, &scratch[i], reg_off, 5723 addr + mem_off + (i << msz), retaddr); 5724 } 5725 } 5726 reg_off += 1 << esz; 5727 mem_off += N << msz; 5728 } while (reg_off & 63); 5729 } while (reg_off <= reg_last); 5730 5731 for (i = 0; i < N; ++i) { 5732 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); 5733 } 5734 return; 5735 } 5736 5737 /* The entire operation is in RAM, on valid pages. */ 5738 5739 for (i = 0; i < N; ++i) { 5740 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5741 } 5742 5743 mem_off = info.mem_off_first[0]; 5744 reg_off = info.reg_off_first[0]; 5745 reg_last = info.reg_off_last[0]; 5746 host = info.page[0].host; 5747 5748 set_helper_retaddr(retaddr); 5749 5750 while (reg_off <= reg_last) { 5751 uint64_t pg = vg[reg_off >> 6]; 5752 do { 5753 if ((pg >> (reg_off & 63)) & 1) { 5754 for (i = 0; i < N; ++i) { 5755 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5756 host + mem_off + (i << msz)); 5757 } 5758 } 5759 reg_off += 1 << esz; 5760 mem_off += N << msz; 5761 } while (reg_off <= reg_last && (reg_off & 63)); 5762 } 5763 5764 clear_helper_retaddr(); 5765 5766 /* 5767 * Use the slow path to manage the cross-page misalignment. 5768 * But we know this is RAM and cannot trap. 
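     * Both pages were validated by the FAULT_ALL probe above and neither
     * is MMIO at this point, so the per-element tlb_fn is used here only
     * to assemble a value that spans the page boundary.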
5769 */ 5770 mem_off = info.mem_off_split; 5771 if (unlikely(mem_off >= 0)) { 5772 reg_off = info.reg_off_split; 5773 for (i = 0; i < N; ++i) { 5774 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 5775 addr + mem_off + (i << msz), retaddr); 5776 } 5777 } 5778 5779 mem_off = info.mem_off_first[1]; 5780 if (unlikely(mem_off >= 0)) { 5781 reg_off = info.reg_off_first[1]; 5782 reg_last = info.reg_off_last[1]; 5783 host = info.page[1].host; 5784 5785 set_helper_retaddr(retaddr); 5786 5787 do { 5788 uint64_t pg = vg[reg_off >> 6]; 5789 do { 5790 if ((pg >> (reg_off & 63)) & 1) { 5791 for (i = 0; i < N; ++i) { 5792 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5793 host + mem_off + (i << msz)); 5794 } 5795 } 5796 reg_off += 1 << esz; 5797 mem_off += N << msz; 5798 } while (reg_off & 63); 5799 } while (reg_off <= reg_last); 5800 5801 clear_helper_retaddr(); 5802 } 5803 } 5804 5805 static inline QEMU_ALWAYS_INLINE 5806 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 5807 uint32_t desc, const uintptr_t ra, 5808 const int esz, const int msz, const int N, 5809 sve_ldst1_host_fn *host_fn, 5810 sve_ldst1_tlb_fn *tlb_fn) 5811 { 5812 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 5813 int bit55 = extract64(addr, 55, 1); 5814 5815 /* Remove mtedesc from the normal sve descriptor. */ 5816 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 5817 5818 /* Perform gross MTE suppression early. */ 5819 if (!tbi_check(mtedesc, bit55) || 5820 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 5821 mtedesc = 0; 5822 } 5823 5824 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 5825 } 5826 5827 #define DO_LD1_1(NAME, ESZ) \ 5828 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ 5829 target_ulong addr, uint32_t desc) \ 5830 { \ 5831 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ 5832 sve_##NAME##_host, sve_##NAME##_tlb); \ 5833 } \ 5834 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ 5835 target_ulong addr, uint32_t desc) \ 5836 { \ 5837 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ 5838 sve_##NAME##_host, sve_##NAME##_tlb); \ 5839 } 5840 5841 #define DO_LD1_2(NAME, ESZ, MSZ) \ 5842 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ 5843 target_ulong addr, uint32_t desc) \ 5844 { \ 5845 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 5846 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 5847 } \ 5848 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ 5849 target_ulong addr, uint32_t desc) \ 5850 { \ 5851 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 5852 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 5853 } \ 5854 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 5855 target_ulong addr, uint32_t desc) \ 5856 { \ 5857 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 5858 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 5859 } \ 5860 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 5861 target_ulong addr, uint32_t desc) \ 5862 { \ 5863 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 5864 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 5865 } 5866 5867 DO_LD1_1(ld1bb, MO_8) 5868 DO_LD1_1(ld1bhu, MO_16) 5869 DO_LD1_1(ld1bhs, MO_16) 5870 DO_LD1_1(ld1bsu, MO_32) 5871 DO_LD1_1(ld1bss, MO_32) 5872 DO_LD1_1(ld1bdu, MO_64) 5873 DO_LD1_1(ld1bds, MO_64) 5874 5875 DO_LD1_2(ld1hh, MO_16, MO_16) 5876 DO_LD1_2(ld1hsu, MO_32, MO_16) 5877 DO_LD1_2(ld1hss, MO_32, MO_16) 5878 DO_LD1_2(ld1hdu, MO_64, MO_16) 5879 
DO_LD1_2(ld1hds, MO_64, MO_16) 5880 5881 DO_LD1_2(ld1ss, MO_32, MO_32) 5882 DO_LD1_2(ld1sdu, MO_64, MO_32) 5883 DO_LD1_2(ld1sds, MO_64, MO_32) 5884 5885 DO_LD1_2(ld1dd, MO_64, MO_64) 5886 5887 #undef DO_LD1_1 5888 #undef DO_LD1_2 5889 5890 #define DO_LDN_1(N) \ 5891 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ 5892 target_ulong addr, uint32_t desc) \ 5893 { \ 5894 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ 5895 sve_ld1bb_host, sve_ld1bb_tlb); \ 5896 } \ 5897 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ 5898 target_ulong addr, uint32_t desc) \ 5899 { \ 5900 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ 5901 sve_ld1bb_host, sve_ld1bb_tlb); \ 5902 } 5903 5904 #define DO_LDN_2(N, SUFF, ESZ) \ 5905 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ 5906 target_ulong addr, uint32_t desc) \ 5907 { \ 5908 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 5909 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 5910 } \ 5911 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ 5912 target_ulong addr, uint32_t desc) \ 5913 { \ 5914 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 5915 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 5916 } \ 5917 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ 5918 target_ulong addr, uint32_t desc) \ 5919 { \ 5920 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 5921 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 5922 } \ 5923 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ 5924 target_ulong addr, uint32_t desc) \ 5925 { \ 5926 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 5927 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 5928 } 5929 5930 DO_LDN_1(2) 5931 DO_LDN_1(3) 5932 DO_LDN_1(4) 5933 5934 DO_LDN_2(2, hh, MO_16) 5935 DO_LDN_2(3, hh, MO_16) 5936 DO_LDN_2(4, hh, MO_16) 5937 5938 DO_LDN_2(2, ss, MO_32) 5939 DO_LDN_2(3, ss, MO_32) 5940 DO_LDN_2(4, ss, MO_32) 5941 5942 DO_LDN_2(2, dd, MO_64) 5943 DO_LDN_2(3, dd, MO_64) 5944 DO_LDN_2(4, dd, MO_64) 5945 5946 #undef DO_LDN_1 5947 #undef DO_LDN_2 5948 5949 /* 5950 * Load contiguous data, first-fault and no-fault. 5951 * 5952 * For user-only, we control the race between page_check_range and 5953 * another thread's munmap by using set/clear_helper_retaddr. Any 5954 * SEGV that occurs between those markers is assumed to be because 5955 * the guest page vanished. Keep that block as small as possible 5956 * so that unrelated QEMU bugs are not blamed on the guest. 5957 */ 5958 5959 /* Fault on byte I. All bits in FFR from I are cleared. The vector 5960 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE 5961 * option, which leaves subsequent data unchanged. 5962 */ 5963 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) 5964 { 5965 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; 5966 5967 if (i & 63) { 5968 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); 5969 i = ROUND_UP(i, 64); 5970 } 5971 for (; i < oprsz; i += 64) { 5972 ffr[i / 64] = 0; 5973 } 5974 } 5975 5976 /* 5977 * Common helper for all contiguous no-fault and first-fault loads. 
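 *
 * FAULT_FIRST (LDFF1): only the first active element may take a real
 * exception; any later element that cannot be read simply clears FFR
 * from that element onward via record_fault().
 * FAULT_NO (LDNF1): no element may take an exception; the first element
 * that cannot be read clears FFR in the same way and the remainder of
 * the load is abandoned.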
5978 */ 5979 static inline QEMU_ALWAYS_INLINE 5980 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, 5981 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc, 5982 const int esz, const int msz, const SVEContFault fault, 5983 sve_ldst1_host_fn *host_fn, 5984 sve_ldst1_tlb_fn *tlb_fn) 5985 { 5986 const unsigned rd = simd_data(desc); 5987 void *vd = &env->vfp.zregs[rd]; 5988 const intptr_t reg_max = simd_oprsz(desc); 5989 intptr_t reg_off, mem_off, reg_last; 5990 SVEContLdSt info; 5991 int flags; 5992 void *host; 5993 5994 /* Find the active elements. */ 5995 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) { 5996 /* The entire predicate was false; no load occurs. */ 5997 memset(vd, 0, reg_max); 5998 return; 5999 } 6000 reg_off = info.reg_off_first[0]; 6001 6002 /* Probe the page(s). */ 6003 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) { 6004 /* Fault on first element. */ 6005 tcg_debug_assert(fault == FAULT_NO); 6006 memset(vd, 0, reg_max); 6007 goto do_fault; 6008 } 6009 6010 mem_off = info.mem_off_first[0]; 6011 flags = info.page[0].flags; 6012 6013 /* 6014 * Disable MTE checking if the Tagged bit is not set. Since TBI must 6015 * be set within MTEDESC for MTE, !mtedesc => !mte_active. 6016 */ 6017 if (!info.page[0].tagged) { 6018 mtedesc = 0; 6019 } 6020 6021 if (fault == FAULT_FIRST) { 6022 /* Trapping mte check for the first-fault element. */ 6023 if (mtedesc) { 6024 mte_check(env, mtedesc, addr + mem_off, retaddr); 6025 } 6026 6027 /* 6028 * Special handling of the first active element, 6029 * if it crosses a page boundary or is MMIO. 6030 */ 6031 bool is_split = mem_off == info.mem_off_split; 6032 if (unlikely(flags != 0) || unlikely(is_split)) { 6033 /* 6034 * Use the slow path for cross-page handling. 6035 * Might trap for MMIO or watchpoints. 6036 */ 6037 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6038 6039 /* After any fault, zero the other elements. */ 6040 swap_memzero(vd, reg_off); 6041 reg_off += 1 << esz; 6042 mem_off += 1 << msz; 6043 swap_memzero(vd + reg_off, reg_max - reg_off); 6044 6045 if (is_split) { 6046 goto second_page; 6047 } 6048 } else { 6049 memset(vd, 0, reg_max); 6050 } 6051 } else { 6052 memset(vd, 0, reg_max); 6053 if (unlikely(mem_off == info.mem_off_split)) { 6054 /* The first active element crosses a page boundary. */ 6055 flags |= info.page[1].flags; 6056 if (unlikely(flags & TLB_MMIO)) { 6057 /* Some page is MMIO, see below. */ 6058 goto do_fault; 6059 } 6060 if (unlikely(flags & TLB_WATCHPOINT) && 6061 (cpu_watchpoint_address_matches 6062 (env_cpu(env), addr + mem_off, 1 << msz) 6063 & BP_MEM_READ)) { 6064 /* Watchpoint hit, see below. */ 6065 goto do_fault; 6066 } 6067 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6068 goto do_fault; 6069 } 6070 /* 6071 * Use the slow path for cross-page handling. 6072 * This is RAM, without a watchpoint, and will not trap. 6073 */ 6074 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6075 goto second_page; 6076 } 6077 } 6078 6079 /* 6080 * From this point on, all memory operations are MemSingleNF. 6081 * 6082 * Per the MemSingleNF pseudocode, a no-fault load from Device memory 6083 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead. 6084 * 6085 * Unfortuately we do not have access to the memory attributes from the 6086 * PTE to tell Device memory from Normal memory. So we make a mostly 6087 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO. 
6088 * This gives the right answer for the common cases of "Normal memory, 6089 * backed by host RAM" and "Device memory, backed by MMIO". 6090 * The architecture allows us to suppress an NF load and return 6091 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner 6092 * case of "Normal memory, backed by MMIO" is permitted. The case we 6093 * get wrong is "Device memory, backed by host RAM", for which we 6094 * should return (UNKNOWN, FAULT) for but do not. 6095 * 6096 * Similarly, CPU_BP breakpoints would raise exceptions, and so 6097 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and 6098 * architectural breakpoints the same. 6099 */ 6100 if (unlikely(flags & TLB_MMIO)) { 6101 goto do_fault; 6102 } 6103 6104 reg_last = info.reg_off_last[0]; 6105 host = info.page[0].host; 6106 6107 set_helper_retaddr(retaddr); 6108 6109 do { 6110 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); 6111 do { 6112 if ((pg >> (reg_off & 63)) & 1) { 6113 if (unlikely(flags & TLB_WATCHPOINT) && 6114 (cpu_watchpoint_address_matches 6115 (env_cpu(env), addr + mem_off, 1 << msz) 6116 & BP_MEM_READ)) { 6117 clear_helper_retaddr(); 6118 goto do_fault; 6119 } 6120 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6121 clear_helper_retaddr(); 6122 goto do_fault; 6123 } 6124 host_fn(vd, reg_off, host + mem_off); 6125 } 6126 reg_off += 1 << esz; 6127 mem_off += 1 << msz; 6128 } while (reg_off <= reg_last && (reg_off & 63)); 6129 } while (reg_off <= reg_last); 6130 6131 clear_helper_retaddr(); 6132 6133 /* 6134 * MemSingleNF is allowed to fail for any reason. We have special 6135 * code above to handle the first element crossing a page boundary. 6136 * As an implementation choice, decline to handle a cross-page element 6137 * in any other position. 6138 */ 6139 reg_off = info.reg_off_split; 6140 if (reg_off >= 0) { 6141 goto do_fault; 6142 } 6143 6144 second_page: 6145 reg_off = info.reg_off_first[1]; 6146 if (likely(reg_off < 0)) { 6147 /* No active elements on the second page. All done. */ 6148 return; 6149 } 6150 6151 /* 6152 * MemSingleNF is allowed to fail for any reason. As an implementation 6153 * choice, decline to handle elements on the second page. This should 6154 * be low frequency as the guest walks through memory -- the next 6155 * iteration of the guest's loop should be aligned on the page boundary, 6156 * and then all following iterations will stay aligned. 6157 */ 6158 6159 do_fault: 6160 record_fault(env, reg_off, reg_max); 6161 } 6162 6163 static inline QEMU_ALWAYS_INLINE 6164 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, 6165 uint32_t desc, const uintptr_t retaddr, 6166 const int esz, const int msz, const SVEContFault fault, 6167 sve_ldst1_host_fn *host_fn, 6168 sve_ldst1_tlb_fn *tlb_fn) 6169 { 6170 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6171 int bit55 = extract64(addr, 55, 1); 6172 6173 /* Remove mtedesc from the normal sve descriptor. */ 6174 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6175 6176 /* Perform gross MTE suppression early. 
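     * If TBI is disabled for this address's bit-55 selector there is no
     * tag in the pointer to check, and if TCMA applies to the pointer's
     * logical tag the access is unchecked by definition; in either case
     * zeroing mtedesc turns the MTE checks further down into no-ops.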
*/ 6177 if (!tbi_check(mtedesc, bit55) || 6178 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6179 mtedesc = 0; 6180 } 6181 6182 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, 6183 esz, msz, fault, host_fn, tlb_fn); 6184 } 6185 6186 #define DO_LDFF1_LDNF1_1(PART, ESZ) \ 6187 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ 6188 target_ulong addr, uint32_t desc) \ 6189 { \ 6190 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ 6191 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6192 } \ 6193 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ 6194 target_ulong addr, uint32_t desc) \ 6195 { \ 6196 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ 6197 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6198 } \ 6199 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6200 target_ulong addr, uint32_t desc) \ 6201 { \ 6202 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ 6203 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6204 } \ 6205 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6206 target_ulong addr, uint32_t desc) \ 6207 { \ 6208 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ 6209 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6210 } 6211 6212 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ 6213 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ 6214 target_ulong addr, uint32_t desc) \ 6215 { \ 6216 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6217 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6218 } \ 6219 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ 6220 target_ulong addr, uint32_t desc) \ 6221 { \ 6222 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6223 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6224 } \ 6225 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ 6226 target_ulong addr, uint32_t desc) \ 6227 { \ 6228 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6229 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6230 } \ 6231 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ 6232 target_ulong addr, uint32_t desc) \ 6233 { \ 6234 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6235 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6236 } \ 6237 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6238 target_ulong addr, uint32_t desc) \ 6239 { \ 6240 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6241 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6242 } \ 6243 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6244 target_ulong addr, uint32_t desc) \ 6245 { \ 6246 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6247 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6248 } \ 6249 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6250 target_ulong addr, uint32_t desc) \ 6251 { \ 6252 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6253 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6254 } \ 6255 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6256 target_ulong addr, uint32_t desc) \ 6257 { \ 6258 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6259 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6260 } 6261 6262 DO_LDFF1_LDNF1_1(bb, MO_8) 6263 DO_LDFF1_LDNF1_1(bhu, 
MO_16) 6264 DO_LDFF1_LDNF1_1(bhs, MO_16) 6265 DO_LDFF1_LDNF1_1(bsu, MO_32) 6266 DO_LDFF1_LDNF1_1(bss, MO_32) 6267 DO_LDFF1_LDNF1_1(bdu, MO_64) 6268 DO_LDFF1_LDNF1_1(bds, MO_64) 6269 6270 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) 6271 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) 6272 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) 6273 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) 6274 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) 6275 6276 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) 6277 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) 6278 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) 6279 6280 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) 6281 6282 #undef DO_LDFF1_LDNF1_1 6283 #undef DO_LDFF1_LDNF1_2 6284 6285 /* 6286 * Common helper for all contiguous 1,2,3,4-register predicated stores. 6287 */ 6288 6289 static inline QEMU_ALWAYS_INLINE 6290 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, 6291 uint32_t desc, const uintptr_t retaddr, 6292 const int esz, const int msz, const int N, uint32_t mtedesc, 6293 sve_ldst1_host_fn *host_fn, 6294 sve_ldst1_tlb_fn *tlb_fn) 6295 { 6296 const unsigned rd = simd_data(desc); 6297 const intptr_t reg_max = simd_oprsz(desc); 6298 intptr_t reg_off, reg_last, mem_off; 6299 SVEContLdSt info; 6300 void *host; 6301 int i, flags; 6302 6303 /* Find the active elements. */ 6304 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6305 /* The entire predicate was false; no store occurs. */ 6306 return; 6307 } 6308 6309 /* Probe the page(s). Exit with exception for any invalid page. */ 6310 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); 6311 6312 /* Handle watchpoints for all active elements. */ 6313 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6314 BP_MEM_WRITE, retaddr); 6315 6316 /* 6317 * Handle mte checks for all active elements. 6318 * Since TBI must be set for MTE, !mtedesc => !mte_active. 6319 */ 6320 if (mtedesc) { 6321 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6322 mtedesc, retaddr); 6323 } 6324 6325 flags = info.page[0].flags | info.page[1].flags; 6326 if (unlikely(flags != 0)) { 6327 /* 6328 * At least one page includes MMIO. 6329 * Any bus operation can fail with cpu_transaction_failed, 6330 * which for ARM will raise SyncExternal. We cannot avoid 6331 * this fault and will leave with the store incomplete. 
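     * (Unlike the load helper above, there is no scratch buffer to hide
     * a partial update behind: a store's side effect is the memory write
     * itself, so an external abort mid-sequence necessarily leaves some
     * elements written and others not.)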
6332 */ 6333 mem_off = info.mem_off_first[0]; 6334 reg_off = info.reg_off_first[0]; 6335 reg_last = info.reg_off_last[1]; 6336 if (reg_last < 0) { 6337 reg_last = info.reg_off_split; 6338 if (reg_last < 0) { 6339 reg_last = info.reg_off_last[0]; 6340 } 6341 } 6342 6343 do { 6344 uint64_t pg = vg[reg_off >> 6]; 6345 do { 6346 if ((pg >> (reg_off & 63)) & 1) { 6347 for (i = 0; i < N; ++i) { 6348 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6349 addr + mem_off + (i << msz), retaddr); 6350 } 6351 } 6352 reg_off += 1 << esz; 6353 mem_off += N << msz; 6354 } while (reg_off & 63); 6355 } while (reg_off <= reg_last); 6356 return; 6357 } 6358 6359 mem_off = info.mem_off_first[0]; 6360 reg_off = info.reg_off_first[0]; 6361 reg_last = info.reg_off_last[0]; 6362 host = info.page[0].host; 6363 6364 set_helper_retaddr(retaddr); 6365 6366 while (reg_off <= reg_last) { 6367 uint64_t pg = vg[reg_off >> 6]; 6368 do { 6369 if ((pg >> (reg_off & 63)) & 1) { 6370 for (i = 0; i < N; ++i) { 6371 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6372 host + mem_off + (i << msz)); 6373 } 6374 } 6375 reg_off += 1 << esz; 6376 mem_off += N << msz; 6377 } while (reg_off <= reg_last && (reg_off & 63)); 6378 } 6379 6380 clear_helper_retaddr(); 6381 6382 /* 6383 * Use the slow path to manage the cross-page misalignment. 6384 * But we know this is RAM and cannot trap. 6385 */ 6386 mem_off = info.mem_off_split; 6387 if (unlikely(mem_off >= 0)) { 6388 reg_off = info.reg_off_split; 6389 for (i = 0; i < N; ++i) { 6390 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6391 addr + mem_off + (i << msz), retaddr); 6392 } 6393 } 6394 6395 mem_off = info.mem_off_first[1]; 6396 if (unlikely(mem_off >= 0)) { 6397 reg_off = info.reg_off_first[1]; 6398 reg_last = info.reg_off_last[1]; 6399 host = info.page[1].host; 6400 6401 set_helper_retaddr(retaddr); 6402 6403 do { 6404 uint64_t pg = vg[reg_off >> 6]; 6405 do { 6406 if ((pg >> (reg_off & 63)) & 1) { 6407 for (i = 0; i < N; ++i) { 6408 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6409 host + mem_off + (i << msz)); 6410 } 6411 } 6412 reg_off += 1 << esz; 6413 mem_off += N << msz; 6414 } while (reg_off & 63); 6415 } while (reg_off <= reg_last); 6416 6417 clear_helper_retaddr(); 6418 } 6419 } 6420 6421 static inline QEMU_ALWAYS_INLINE 6422 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6423 uint32_t desc, const uintptr_t ra, 6424 const int esz, const int msz, const int N, 6425 sve_ldst1_host_fn *host_fn, 6426 sve_ldst1_tlb_fn *tlb_fn) 6427 { 6428 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6429 int bit55 = extract64(addr, 55, 1); 6430 6431 /* Remove mtedesc from the normal sve descriptor. */ 6432 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6433 6434 /* Perform gross MTE suppression early. 
*/ 6435 if (!tbi_check(mtedesc, bit55) || 6436 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6437 mtedesc = 0; 6438 } 6439 6440 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6441 } 6442 6443 #define DO_STN_1(N, NAME, ESZ) \ 6444 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ 6445 target_ulong addr, uint32_t desc) \ 6446 { \ 6447 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ 6448 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6449 } \ 6450 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6451 target_ulong addr, uint32_t desc) \ 6452 { \ 6453 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ 6454 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6455 } 6456 6457 #define DO_STN_2(N, NAME, ESZ, MSZ) \ 6458 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ 6459 target_ulong addr, uint32_t desc) \ 6460 { \ 6461 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6462 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6463 } \ 6464 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ 6465 target_ulong addr, uint32_t desc) \ 6466 { \ 6467 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6468 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6469 } \ 6470 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6471 target_ulong addr, uint32_t desc) \ 6472 { \ 6473 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6474 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6475 } \ 6476 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6477 target_ulong addr, uint32_t desc) \ 6478 { \ 6479 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6480 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6481 } 6482 6483 DO_STN_1(1, bb, MO_8) 6484 DO_STN_1(1, bh, MO_16) 6485 DO_STN_1(1, bs, MO_32) 6486 DO_STN_1(1, bd, MO_64) 6487 DO_STN_1(2, bb, MO_8) 6488 DO_STN_1(3, bb, MO_8) 6489 DO_STN_1(4, bb, MO_8) 6490 6491 DO_STN_2(1, hh, MO_16, MO_16) 6492 DO_STN_2(1, hs, MO_32, MO_16) 6493 DO_STN_2(1, hd, MO_64, MO_16) 6494 DO_STN_2(2, hh, MO_16, MO_16) 6495 DO_STN_2(3, hh, MO_16, MO_16) 6496 DO_STN_2(4, hh, MO_16, MO_16) 6497 6498 DO_STN_2(1, ss, MO_32, MO_32) 6499 DO_STN_2(1, sd, MO_64, MO_32) 6500 DO_STN_2(2, ss, MO_32, MO_32) 6501 DO_STN_2(3, ss, MO_32, MO_32) 6502 DO_STN_2(4, ss, MO_32, MO_32) 6503 6504 DO_STN_2(1, dd, MO_64, MO_64) 6505 DO_STN_2(2, dd, MO_64, MO_64) 6506 DO_STN_2(3, dd, MO_64, MO_64) 6507 DO_STN_2(4, dd, MO_64, MO_64) 6508 6509 #undef DO_STN_1 6510 #undef DO_STN_2 6511 6512 /* 6513 * Loads with a vector index. 6514 */ 6515 6516 /* 6517 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. 
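 *
 * The off_* helpers below decode the per-element offsets held in the
 * index vector: "zsu"/"zss" treat each offset as an unsigned/signed
 * 32-bit value and "zd" as a full 64-bit value, while the _s/_d suffix
 * gives the element size of the vector being loaded or stored.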
6518 */ 6519 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); 6520 6521 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) 6522 { 6523 return *(uint32_t *)(reg + H1_4(reg_ofs)); 6524 } 6525 6526 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) 6527 { 6528 return *(int32_t *)(reg + H1_4(reg_ofs)); 6529 } 6530 6531 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) 6532 { 6533 return (uint32_t)*(uint64_t *)(reg + reg_ofs); 6534 } 6535 6536 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) 6537 { 6538 return (int32_t)*(uint64_t *)(reg + reg_ofs); 6539 } 6540 6541 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) 6542 { 6543 return *(uint64_t *)(reg + reg_ofs); 6544 } 6545 6546 static inline QEMU_ALWAYS_INLINE 6547 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6548 target_ulong base, uint32_t desc, uintptr_t retaddr, 6549 uint32_t mtedesc, int esize, int msize, 6550 zreg_off_fn *off_fn, 6551 sve_ldst1_host_fn *host_fn, 6552 sve_ldst1_tlb_fn *tlb_fn) 6553 { 6554 const int mmu_idx = arm_env_mmu_index(env); 6555 const intptr_t reg_max = simd_oprsz(desc); 6556 const int scale = simd_data(desc); 6557 ARMVectorReg scratch; 6558 intptr_t reg_off; 6559 SVEHostPage info, info2; 6560 6561 memset(&scratch, 0, reg_max); 6562 reg_off = 0; 6563 do { 6564 uint64_t pg = vg[reg_off >> 6]; 6565 do { 6566 if (likely(pg & 1)) { 6567 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6568 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6569 6570 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, 6571 mmu_idx, retaddr); 6572 6573 if (likely(in_page >= msize)) { 6574 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6575 cpu_check_watchpoint(env_cpu(env), addr, msize, 6576 info.attrs, BP_MEM_READ, retaddr); 6577 } 6578 if (mtedesc && info.tagged) { 6579 mte_check(env, mtedesc, addr, retaddr); 6580 } 6581 if (unlikely(info.flags & TLB_MMIO)) { 6582 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6583 } else { 6584 set_helper_retaddr(retaddr); 6585 host_fn(&scratch, reg_off, info.host); 6586 clear_helper_retaddr(); 6587 } 6588 } else { 6589 /* Element crosses the page boundary. */ 6590 sve_probe_page(&info2, false, env, addr + in_page, 0, 6591 MMU_DATA_LOAD, mmu_idx, retaddr); 6592 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { 6593 cpu_check_watchpoint(env_cpu(env), addr, 6594 msize, info.attrs, 6595 BP_MEM_READ, retaddr); 6596 } 6597 if (mtedesc && info.tagged) { 6598 mte_check(env, mtedesc, addr, retaddr); 6599 } 6600 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6601 } 6602 } 6603 reg_off += esize; 6604 pg >>= esize; 6605 } while (reg_off & 63); 6606 } while (reg_off < reg_max); 6607 6608 /* Wait until all exceptions have been raised to write back. */ 6609 memcpy(vd, &scratch, reg_max); 6610 } 6611 6612 static inline QEMU_ALWAYS_INLINE 6613 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6614 target_ulong base, uint32_t desc, uintptr_t retaddr, 6615 int esize, int msize, zreg_off_fn *off_fn, 6616 sve_ldst1_host_fn *host_fn, 6617 sve_ldst1_tlb_fn *tlb_fn) 6618 { 6619 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6620 /* Remove mtedesc from the normal sve descriptor. */ 6621 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6622 6623 /* 6624 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6625 * offset base entirely over the address space hole to change the 6626 * pointer tag, or change the bit55 selector. 
So we could here 6627 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 6628 */ 6629 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6630 esize, msize, off_fn, host_fn, tlb_fn); 6631 } 6632 6633 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ 6634 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6635 void *vm, target_ulong base, uint32_t desc) \ 6636 { \ 6637 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 6638 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6639 } \ 6640 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6641 void *vm, target_ulong base, uint32_t desc) \ 6642 { \ 6643 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 6644 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6645 } 6646 6647 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ 6648 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6649 void *vm, target_ulong base, uint32_t desc) \ 6650 { \ 6651 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 6652 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6653 } \ 6654 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6655 void *vm, target_ulong base, uint32_t desc) \ 6656 { \ 6657 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 6658 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6659 } 6660 6661 DO_LD1_ZPZ_S(bsu, zsu, MO_8) 6662 DO_LD1_ZPZ_S(bsu, zss, MO_8) 6663 DO_LD1_ZPZ_D(bdu, zsu, MO_8) 6664 DO_LD1_ZPZ_D(bdu, zss, MO_8) 6665 DO_LD1_ZPZ_D(bdu, zd, MO_8) 6666 6667 DO_LD1_ZPZ_S(bss, zsu, MO_8) 6668 DO_LD1_ZPZ_S(bss, zss, MO_8) 6669 DO_LD1_ZPZ_D(bds, zsu, MO_8) 6670 DO_LD1_ZPZ_D(bds, zss, MO_8) 6671 DO_LD1_ZPZ_D(bds, zd, MO_8) 6672 6673 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) 6674 DO_LD1_ZPZ_S(hsu_le, zss, MO_16) 6675 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) 6676 DO_LD1_ZPZ_D(hdu_le, zss, MO_16) 6677 DO_LD1_ZPZ_D(hdu_le, zd, MO_16) 6678 6679 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) 6680 DO_LD1_ZPZ_S(hsu_be, zss, MO_16) 6681 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) 6682 DO_LD1_ZPZ_D(hdu_be, zss, MO_16) 6683 DO_LD1_ZPZ_D(hdu_be, zd, MO_16) 6684 6685 DO_LD1_ZPZ_S(hss_le, zsu, MO_16) 6686 DO_LD1_ZPZ_S(hss_le, zss, MO_16) 6687 DO_LD1_ZPZ_D(hds_le, zsu, MO_16) 6688 DO_LD1_ZPZ_D(hds_le, zss, MO_16) 6689 DO_LD1_ZPZ_D(hds_le, zd, MO_16) 6690 6691 DO_LD1_ZPZ_S(hss_be, zsu, MO_16) 6692 DO_LD1_ZPZ_S(hss_be, zss, MO_16) 6693 DO_LD1_ZPZ_D(hds_be, zsu, MO_16) 6694 DO_LD1_ZPZ_D(hds_be, zss, MO_16) 6695 DO_LD1_ZPZ_D(hds_be, zd, MO_16) 6696 6697 DO_LD1_ZPZ_S(ss_le, zsu, MO_32) 6698 DO_LD1_ZPZ_S(ss_le, zss, MO_32) 6699 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) 6700 DO_LD1_ZPZ_D(sdu_le, zss, MO_32) 6701 DO_LD1_ZPZ_D(sdu_le, zd, MO_32) 6702 6703 DO_LD1_ZPZ_S(ss_be, zsu, MO_32) 6704 DO_LD1_ZPZ_S(ss_be, zss, MO_32) 6705 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) 6706 DO_LD1_ZPZ_D(sdu_be, zss, MO_32) 6707 DO_LD1_ZPZ_D(sdu_be, zd, MO_32) 6708 6709 DO_LD1_ZPZ_D(sds_le, zsu, MO_32) 6710 DO_LD1_ZPZ_D(sds_le, zss, MO_32) 6711 DO_LD1_ZPZ_D(sds_le, zd, MO_32) 6712 6713 DO_LD1_ZPZ_D(sds_be, zsu, MO_32) 6714 DO_LD1_ZPZ_D(sds_be, zss, MO_32) 6715 DO_LD1_ZPZ_D(sds_be, zd, MO_32) 6716 6717 DO_LD1_ZPZ_D(dd_le, zsu, MO_64) 6718 DO_LD1_ZPZ_D(dd_le, zss, MO_64) 6719 DO_LD1_ZPZ_D(dd_le, zd, MO_64) 6720 6721 DO_LD1_ZPZ_D(dd_be, zsu, MO_64) 6722 DO_LD1_ZPZ_D(dd_be, zss, MO_64) 6723 DO_LD1_ZPZ_D(dd_be, zd, MO_64) 6724 6725 #undef DO_LD1_ZPZ_S 6726 #undef DO_LD1_ZPZ_D 6727 6728 /* First fault loads with a vector index. 
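 *
 * As with the contiguous first-fault forms, only the first active
 * element may take a real fault.  Every later element is probed
 * non-faulting, and the first one that cannot be read (page-crossing,
 * invalid page, MMIO, watchpoint hit or failed MTE probe) clears FFR
 * from that element onward and ends the gather.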
*/ 6729 6730 /* 6731 * Common helpers for all gather first-faulting loads. 6732 */ 6733 6734 static inline QEMU_ALWAYS_INLINE 6735 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6736 target_ulong base, uint32_t desc, uintptr_t retaddr, 6737 uint32_t mtedesc, const int esz, const int msz, 6738 zreg_off_fn *off_fn, 6739 sve_ldst1_host_fn *host_fn, 6740 sve_ldst1_tlb_fn *tlb_fn) 6741 { 6742 const int mmu_idx = arm_env_mmu_index(env); 6743 const intptr_t reg_max = simd_oprsz(desc); 6744 const int scale = simd_data(desc); 6745 const int esize = 1 << esz; 6746 const int msize = 1 << msz; 6747 intptr_t reg_off; 6748 SVEHostPage info; 6749 target_ulong addr, in_page; 6750 ARMVectorReg scratch; 6751 6752 /* Skip to the first true predicate. */ 6753 reg_off = find_next_active(vg, 0, reg_max, esz); 6754 if (unlikely(reg_off >= reg_max)) { 6755 /* The entire predicate was false; no load occurs. */ 6756 memset(vd, 0, reg_max); 6757 return; 6758 } 6759 6760 /* Protect against overlap between vd and vm. */ 6761 if (unlikely(vd == vm)) { 6762 vm = memcpy(&scratch, vm, reg_max); 6763 } 6764 6765 /* 6766 * Probe the first element, allowing faults. 6767 */ 6768 addr = base + (off_fn(vm, reg_off) << scale); 6769 if (mtedesc) { 6770 mte_check(env, mtedesc, addr, retaddr); 6771 } 6772 tlb_fn(env, vd, reg_off, addr, retaddr); 6773 6774 /* After any fault, zero the other elements. */ 6775 swap_memzero(vd, reg_off); 6776 reg_off += esize; 6777 swap_memzero(vd + reg_off, reg_max - reg_off); 6778 6779 /* 6780 * Probe the remaining elements, not allowing faults. 6781 */ 6782 while (reg_off < reg_max) { 6783 uint64_t pg = vg[reg_off >> 6]; 6784 do { 6785 if (likely((pg >> (reg_off & 63)) & 1)) { 6786 addr = base + (off_fn(vm, reg_off) << scale); 6787 in_page = -(addr | TARGET_PAGE_MASK); 6788 6789 if (unlikely(in_page < msize)) { 6790 /* Stop if the element crosses a page boundary. */ 6791 goto fault; 6792 } 6793 6794 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, 6795 mmu_idx, retaddr); 6796 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { 6797 goto fault; 6798 } 6799 if (unlikely(info.flags & TLB_WATCHPOINT) && 6800 (cpu_watchpoint_address_matches 6801 (env_cpu(env), addr, msize) & BP_MEM_READ)) { 6802 goto fault; 6803 } 6804 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { 6805 goto fault; 6806 } 6807 6808 set_helper_retaddr(retaddr); 6809 host_fn(vd, reg_off, info.host); 6810 clear_helper_retaddr(); 6811 } 6812 reg_off += esize; 6813 } while (reg_off & 63); 6814 } 6815 return; 6816 6817 fault: 6818 record_fault(env, reg_off, reg_max); 6819 } 6820 6821 static inline QEMU_ALWAYS_INLINE 6822 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6823 target_ulong base, uint32_t desc, uintptr_t retaddr, 6824 const int esz, const int msz, 6825 zreg_off_fn *off_fn, 6826 sve_ldst1_host_fn *host_fn, 6827 sve_ldst1_tlb_fn *tlb_fn) 6828 { 6829 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6830 /* Remove mtedesc from the normal sve descriptor. */ 6831 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6832 6833 /* 6834 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6835 * offset base entirely over the address space hole to change the 6836 * pointer tag, or change the bit55 selector. So we could here 6837 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
6838 */ 6839 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6840 esz, msz, off_fn, host_fn, tlb_fn); 6841 } 6842 6843 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ 6844 void HELPER(sve_ldff##MEM##_##OFS) \ 6845 (CPUARMState *env, void *vd, void *vg, \ 6846 void *vm, target_ulong base, uint32_t desc) \ 6847 { \ 6848 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ 6849 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6850 } \ 6851 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 6852 (CPUARMState *env, void *vd, void *vg, \ 6853 void *vm, target_ulong base, uint32_t desc) \ 6854 { \ 6855 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ 6856 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6857 } 6858 6859 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ 6860 void HELPER(sve_ldff##MEM##_##OFS) \ 6861 (CPUARMState *env, void *vd, void *vg, \ 6862 void *vm, target_ulong base, uint32_t desc) \ 6863 { \ 6864 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ 6865 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6866 } \ 6867 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 6868 (CPUARMState *env, void *vd, void *vg, \ 6869 void *vm, target_ulong base, uint32_t desc) \ 6870 { \ 6871 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ 6872 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6873 } 6874 6875 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) 6876 DO_LDFF1_ZPZ_S(bsu, zss, MO_8) 6877 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) 6878 DO_LDFF1_ZPZ_D(bdu, zss, MO_8) 6879 DO_LDFF1_ZPZ_D(bdu, zd, MO_8) 6880 6881 DO_LDFF1_ZPZ_S(bss, zsu, MO_8) 6882 DO_LDFF1_ZPZ_S(bss, zss, MO_8) 6883 DO_LDFF1_ZPZ_D(bds, zsu, MO_8) 6884 DO_LDFF1_ZPZ_D(bds, zss, MO_8) 6885 DO_LDFF1_ZPZ_D(bds, zd, MO_8) 6886 6887 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) 6888 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) 6889 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) 6890 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) 6891 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) 6892 6893 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) 6894 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) 6895 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) 6896 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) 6897 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) 6898 6899 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16) 6900 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) 6901 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) 6902 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) 6903 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) 6904 6905 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) 6906 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) 6907 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) 6908 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) 6909 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) 6910 6911 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) 6912 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) 6913 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) 6914 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) 6915 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) 6916 6917 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) 6918 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) 6919 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) 6920 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) 6921 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) 6922 6923 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) 6924 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) 6925 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) 6926 6927 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) 6928 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) 6929 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) 6930 6931 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) 6932 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) 6933 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) 6934 6935 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) 6936 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) 6937 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) 6938 6939 /* Stores with a vector index. 
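 *
 * The scatter store below runs in two passes: every active element is
 * probed first (collecting host pointers and raising any translation,
 * watchpoint or MTE exception before memory is modified), and only then
 * are the stores performed.  Apart from a SyncExternal abort on an MMIO
 * store, an exception therefore never leaves a partially written
 * scatter.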
*/ 6940 6941 static inline QEMU_ALWAYS_INLINE 6942 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6943 target_ulong base, uint32_t desc, uintptr_t retaddr, 6944 uint32_t mtedesc, int esize, int msize, 6945 zreg_off_fn *off_fn, 6946 sve_ldst1_host_fn *host_fn, 6947 sve_ldst1_tlb_fn *tlb_fn) 6948 { 6949 const int mmu_idx = arm_env_mmu_index(env); 6950 const intptr_t reg_max = simd_oprsz(desc); 6951 const int scale = simd_data(desc); 6952 void *host[ARM_MAX_VQ * 4]; 6953 intptr_t reg_off, i; 6954 SVEHostPage info, info2; 6955 6956 /* 6957 * Probe all of the elements for host addresses and flags. 6958 */ 6959 i = reg_off = 0; 6960 do { 6961 uint64_t pg = vg[reg_off >> 6]; 6962 do { 6963 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6964 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6965 6966 host[i] = NULL; 6967 if (likely((pg >> (reg_off & 63)) & 1)) { 6968 if (likely(in_page >= msize)) { 6969 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE, 6970 mmu_idx, retaddr); 6971 if (!(info.flags & TLB_MMIO)) { 6972 host[i] = info.host; 6973 } 6974 } else { 6975 /* 6976 * Element crosses the page boundary. 6977 * Probe both pages, but do not record the host address, 6978 * so that we use the slow path. 6979 */ 6980 sve_probe_page(&info, false, env, addr, 0, 6981 MMU_DATA_STORE, mmu_idx, retaddr); 6982 sve_probe_page(&info2, false, env, addr + in_page, 0, 6983 MMU_DATA_STORE, mmu_idx, retaddr); 6984 info.flags |= info2.flags; 6985 } 6986 6987 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6988 cpu_check_watchpoint(env_cpu(env), addr, msize, 6989 info.attrs, BP_MEM_WRITE, retaddr); 6990 } 6991 6992 if (mtedesc && info.tagged) { 6993 mte_check(env, mtedesc, addr, retaddr); 6994 } 6995 } 6996 i += 1; 6997 reg_off += esize; 6998 } while (reg_off & 63); 6999 } while (reg_off < reg_max); 7000 7001 /* 7002 * Now that we have recognized all exceptions except SyncExternal 7003 * (from TLB_MMIO), which we cannot avoid, perform all of the stores. 7004 * 7005 * Note for the common case of an element in RAM, not crossing a page 7006 * boundary, we have stored the host address in host[]. This doubles 7007 * as a first-level check against the predicate, since only enabled 7008 * elements have non-null host addresses. 7009 */ 7010 i = reg_off = 0; 7011 do { 7012 void *h = host[i]; 7013 if (likely(h != NULL)) { 7014 set_helper_retaddr(retaddr); 7015 host_fn(vd, reg_off, h); 7016 clear_helper_retaddr(); 7017 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) { 7018 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7019 tlb_fn(env, vd, reg_off, addr, retaddr); 7020 } 7021 i += 1; 7022 reg_off += esize; 7023 } while (reg_off < reg_max); 7024 } 7025 7026 static inline QEMU_ALWAYS_INLINE 7027 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7028 target_ulong base, uint32_t desc, uintptr_t retaddr, 7029 int esize, int msize, zreg_off_fn *off_fn, 7030 sve_ldst1_host_fn *host_fn, 7031 sve_ldst1_tlb_fn *tlb_fn) 7032 { 7033 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7034 /* Remove mtedesc from the normal sve descriptor. */ 7035 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7036 7037 /* 7038 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7039 * offset base entirely over the address space hole to change the 7040 * pointer tag, or change the bit55 selector. So we could here 7041 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
static inline QEMU_ALWAYS_INLINE
void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}

#define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                          \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                            \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);       \
}                                                                            \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                            \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);   \
}

#define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                          \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                            \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);       \
}                                                                            \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                            \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);   \
}

DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D
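
/*
 * Illustrative note (not part of the original source): in the scatter
 * store expansions above, the literal 4 or 8 is esize, the step in bytes
 * between vector elements, and 1 << MSZ is msize, the size of each memory
 * access.  For example, DO_ST1_ZPZ_D(hd_le, zd, MO_16) generates
 * HELPER(sve_sthd_le_zd), which steps through 8-byte elements
 * (esize == 8), stores 2 bytes per active element (msize == 1 << MO_16),
 * and forms addresses from 64-bit vector offsets via off_zd_d.
 */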
void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = n[i] ^ m[i] ^ k[i];
    }
}

void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = n[i] ^ (m[i] & ~k[i]);
    }
}

void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
    }
}

void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
    }
}

void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
    }
}

/*
 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
 * See hasless(v,1) from
 *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;
    uint64_t ones = dup_const(esz, 1);
    uint64_t signs = ones << (bits - 1);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(esz, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;
    cmp0 = (cmp0 - ones) & ~cmp0;
    cmp1 = (cmp1 - ones) & ~cmp1;
    return (cmp0 | cmp1) & signs;
}
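
/*
 * Illustrative worked example (not part of the original source), for
 * esz == MO_8, i.e. bits == 8, ones == 0x0101...01, signs == 0x8080...80:
 * with search byte n == 0x42, a candidate byte equal to 0x42 gives
 * cmp == 0x00 and (cmp - 1) & ~cmp == 0xff & 0xff, so bit 7 is set;
 * a candidate 0x43 gives cmp == 0x01 and (cmp - 1) & ~cmp == 0x00 & 0xfe,
 * so bit 7 is clear.  Borrows out of a zero byte can set spurious sign
 * bits in the bytes above it, but when no byte is zero no borrow occurs,
 * so the final "& signs" is nonzero exactly when some element of m0 or
 * m1 matches n, which is all do_match2's boolean result needs.
 */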
static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
                                uint32_t desc, int esz, bool nmatch)
{
    uint16_t esz_mask = pred_esz_masks[esz];
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}

#define DO_PPZZ_MATCH(NAME, ESZ, INV)                                        \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                            \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV);                         \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH

void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}

void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t count = 0;
        if (pg[H1(i)] & 1) {
            uint64_t nn = n[i];
            for (j = 0; j <= i; ++j) {
                if ((pg[H1(j)] & 1) && nn == m[j]) {
                    ++count;
                }
            }
        }
        d[i] = count;
    }
}

/*
 * Returns the number of bytes in m0 and m1 that match n.
 * Unlike do_match2 we don't just need true/false, we need an exact count.
 * This requires two extra logical operations.
 */
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /*
     * 1: clear msb of each byte to avoid carry to next byte (& mask)
     * 2: carry in to msb if byte != 0 (+ mask)
     * 3: set msb if cmp has msb set (| cmp)
     * 4: set ~msb to ignore them (| mask)
     * We now have 0xff for byte != 0 or 0x7f for byte == 0.
     * 5: invert, resulting in 0x80 if and only if byte == 0.
     */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /*
     * Combine the two compares in a way that the bits do
     * not overlap, and so preserves the count of set bits.
     * If the host has an efficient instruction for ctpop,
     * then ctpop(x) + ctpop(y) has the same number of
     * operations as ctpop(x | (y >> 1)).  If the host does
     * not have an efficient ctpop, then we only want to
     * use it once.
     */
    return ctpop64(cmp0 | (cmp1 >> 1));
}
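
/*
 * Illustrative worked example (not part of the original source), tracing
 * one byte x = cmp ^ n through steps 1-5 above:
 *   x == 0x00: ((0x00 & 0x7f) + 0x7f) | 0x00 | 0x7f = 0x7f, inverted 0x80
 *   x == 0x80: ((0x80 & 0x7f) + 0x7f) | 0x80 | 0x7f = 0xff, inverted 0x00
 *   x == 0x01: ((0x01 & 0x7f) + 0x7f) | 0x01 | 0x7f = 0xff, inverted 0x00
 * so each matching byte contributes exactly one set bit (bit 7), and the
 * "& mask" before the add keeps carries from crossing byte boundaries.
 * Since cmp0 keeps its marks in bit 7 of each byte and cmp1 >> 1 moves
 * its marks to bit 6, the two never overlap and a single ctpop64 gives
 * the total match count across both m0 and m1.
 */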
void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t n0 = *(uint64_t *)(vn + i);
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t n1 = *(uint64_t *)(vn + i + 8);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint64_t out0 = 0;
        uint64_t out1 = 0;

        for (j = 0; j < 64; j += 8) {
            uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
            uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
            out0 |= cnt0 << j;
            out1 |= cnt1 << j;
        }

        *(uint64_t *)(vd + i) = out0;
        *(uint64_t *)(vd + i + 8) = out1;
    }
}

void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 8 - shr;
    uint64_t mask = dup_const(MO_8, 0xff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 16 - shr;
    uint64_t mask = dup_const(MO_16, 0xffff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    int shr = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror32(n[i] ^ m[i], shr);
    }
}

void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
                     float_status *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float32 *n = vn + s * sizeof(float32) * 4;
        float32 *m = vm + s * sizeof(float32) * 4;
        float32 *a = va + s * sizeof(float32) * 4;
        float32 *d = vd + s * sizeof(float32) * 4;
        float32 n00 = n[H4(0)], n01 = n[H4(1)];
        float32 n10 = n[H4(2)], n11 = n[H4(3)];
        float32 m00 = m[H4(0)], m01 = m[H4(1)];
        float32 m10 = m[H4(2)], m11 = m[H4(3)];
        float32 p0, p1;

        /* i = 0, j = 0 */
        p0 = float32_mul(n00, m00, status);
        p1 = float32_mul(n01, m01, status);
        d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float32_mul(n00, m10, status);
        p1 = float32_mul(n01, m11, status);
        d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float32_mul(n10, m00, status);
        p1 = float32_mul(n11, m01, status);
        d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float32_mul(n10, m10, status);
        p1 = float32_mul(n11, m11, status);
        d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
    }
}

void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
                     float_status *status, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);

    for (s = 0; s < opr_sz; ++s) {
        float64 *n = vn + s * sizeof(float64) * 4;
        float64 *m = vm + s * sizeof(float64) * 4;
        float64 *a = va + s * sizeof(float64) * 4;
        float64 *d = vd + s * sizeof(float64) * 4;
        float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
        float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
        float64 p0, p1;

        /* i = 0, j = 0 */
        p0 = float64_mul(n00, m00, status);
        p1 = float64_mul(n01, m01, status);
        d[0] = float64_add(a[0], float64_add(p0, p1, status), status);

        /* i = 0, j = 1 */
        p0 = float64_mul(n00, m10, status);
        p1 = float64_mul(n01, m11, status);
        d[1] = float64_add(a[1], float64_add(p0, p1, status), status);

        /* i = 1, j = 0 */
        p0 = float64_mul(n10, m00, status);
        p1 = float64_mul(n11, m01, status);
        d[2] = float64_add(a[2], float64_add(p0, p1, status), status);

        /* i = 1, j = 1 */
        p0 = float64_mul(n10, m10, status);
        p1 = float64_mul(n11, m11, status);
        d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
    }
}
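
/*
 * Illustrative note (not part of the original source): viewing each
 * 128-bit (fmmla_s) or 256-bit (fmmla_d) segment as row-major 2x2
 * matrices N, M and A, the bodies above compute, per segment,
 *
 *   D[i][j] = A[i][j] + N[i][0] * M[j][0] + N[i][1] * M[j][1]
 *
 * i.e. D = A + N * M^T, with every product and sum performed as a
 * separate (non-fused) softfloat operation using the supplied
 * float_status.
 */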
#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                            \
void HELPER(NAME)(void *vd, void *vn, void *vg,                              \
                  float_status *status, uint32_t desc)                       \
{                                                                            \
    intptr_t i = simd_oprsz(desc);                                           \
    uint64_t *g = vg;                                                        \
    do {                                                                     \
        uint64_t pg = g[(i - 1) >> 6];                                       \
        do {                                                                 \
            i -= sizeof(TYPEW);                                              \
            if (likely((pg >> (i & 63)) & 1)) {                              \
                TYPEW nn = *(TYPEW *)(vn + HW(i));                           \
                *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);     \
            }                                                                \
        } while (i & 63);                                                    \
    } while (i != 0);                                                        \
}

DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)

#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                            \
void HELPER(NAME)(void *vd, void *vn, void *vg,                              \
                  float_status *status, uint32_t desc)                       \
{                                                                            \
    intptr_t i = simd_oprsz(desc);                                           \
    uint64_t *g = vg;                                                        \
    do {                                                                     \
        uint64_t pg = g[(i - 1) >> 6];                                       \
        do {                                                                 \
            i -= sizeof(TYPEW);                                              \
            if (likely((pg >> (i & 63)) & 1)) {                              \
                TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));           \
                *(TYPEW *)(vd + HW(i)) = OP(nn, status);                     \
            }                                                                \
        } while (i & 63);                                                    \
    } while (i != 0);                                                        \
}

DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)

#undef DO_FCVTLT
#undef DO_FCVTNT
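
/*
 * Illustrative note (not part of the original source): in both macros the
 * narrow value occupies the high ("top") half of each wide element, at
 * byte offset i + sizeof(TYPEN) within that element; the H macros merely
 * adjust the byte offset on big-endian hosts.  So sve2_fcvtnt_ds converts
 * each predicated float64 and writes the float32 result into the top
 * 4 bytes of the same 8-byte slot, while sve2_fcvtlt_sd reads the float32
 * from the top 4 bytes and widens it in place, both walking the vector
 * from the last element down to the first.
 */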