1 /* 2 * ARM SVE Operations 3 * 4 * Copyright (c) 2018 Linaro, Ltd. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "cpu.h" 22 #include "internals.h" 23 #include "exec/exec-all.h" 24 #include "exec/page-protection.h" 25 #include "exec/helper-proto.h" 26 #include "exec/target_page.h" 27 #include "exec/tlb-flags.h" 28 #include "tcg/tcg-gvec-desc.h" 29 #include "fpu/softfloat.h" 30 #include "tcg/tcg.h" 31 #include "vec_internal.h" 32 #include "sve_ldst_internal.h" 33 #include "accel/tcg/cpu-ldst.h" 34 #include "accel/tcg/cpu-ops.h" 35 #ifdef CONFIG_USER_ONLY 36 #include "user/page-protection.h" 37 #endif 38 39 40 /* Return a value for NZCV as per the ARM PredTest pseudofunction. 41 * 42 * The return value has bit 31 set if N is set, bit 1 set if Z is clear, 43 * and bit 0 set if C is set. Compare the definitions of these variables 44 * within CPUARMState. 45 */ 46 47 /* For no G bits set, NZCV = C. */ 48 #define PREDTEST_INIT 1 49 50 /* This is an iterative function, called for each Pd and Pg word 51 * moving forward. 52 */ 53 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags) 54 { 55 if (likely(g)) { 56 /* Compute N from first D & G. 57 Use bit 2 to signal first G bit seen. */ 58 if (!(flags & 4)) { 59 flags |= ((d & (g & -g)) != 0) << 31; 60 flags |= 4; 61 } 62 63 /* Accumulate Z from each D & G. */ 64 flags |= ((d & g) != 0) << 1; 65 66 /* Compute C from last !(D & G). Replace previous. */ 67 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0); 68 } 69 return flags; 70 } 71 72 /* This is an iterative function, called for each Pd and Pg word 73 * moving backward. 74 */ 75 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags) 76 { 77 if (likely(g)) { 78 /* Compute C from first (i.e last) !(D & G). 79 Use bit 2 to signal first G bit seen. */ 80 if (!(flags & 4)) { 81 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */ 82 flags |= (d & pow2floor(g)) == 0; 83 } 84 85 /* Accumulate Z from each D & G. */ 86 flags |= ((d & g) != 0) << 1; 87 88 /* Compute N from last (i.e first) D & G. Replace previous. */ 89 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0); 90 } 91 return flags; 92 } 93 94 /* The same for a single word predicate. */ 95 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g) 96 { 97 return iter_predtest_fwd(d, g, PREDTEST_INIT); 98 } 99 100 /* The same for a multi-word predicate. */ 101 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words) 102 { 103 uint32_t flags = PREDTEST_INIT; 104 uint64_t *d = vd, *g = vg; 105 uintptr_t i = 0; 106 107 do { 108 flags = iter_predtest_fwd(d[i], g[i], flags); 109 } while (++i < words); 110 111 return flags; 112 } 113 114 /* Similarly for single word elements. 
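   Only bits 0 and 4 of the predicate byte are significant for word
   elements: e.g. expand_pred_s(0x10) yields 0xffffffff00000000, and
   any other bits are discarded by the "byte & 0x11" mask.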
*/ 115 static inline uint64_t expand_pred_s(uint8_t byte) 116 { 117 static const uint64_t word[] = { 118 [0x01] = 0x00000000ffffffffull, 119 [0x10] = 0xffffffff00000000ull, 120 [0x11] = 0xffffffffffffffffull, 121 }; 122 return word[byte & 0x11]; 123 } 124 125 #define LOGICAL_PPPP(NAME, FUNC) \ 126 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 127 { \ 128 uintptr_t opr_sz = simd_oprsz(desc); \ 129 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \ 130 uintptr_t i; \ 131 for (i = 0; i < opr_sz / 8; ++i) { \ 132 d[i] = FUNC(n[i], m[i], g[i]); \ 133 } \ 134 } 135 136 #define DO_AND(N, M, G) (((N) & (M)) & (G)) 137 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G)) 138 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G)) 139 #define DO_ORR(N, M, G) (((N) | (M)) & (G)) 140 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G)) 141 #define DO_NOR(N, M, G) (~((N) | (M)) & (G)) 142 #define DO_NAND(N, M, G) (~((N) & (M)) & (G)) 143 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G))) 144 145 LOGICAL_PPPP(sve_and_pppp, DO_AND) 146 LOGICAL_PPPP(sve_bic_pppp, DO_BIC) 147 LOGICAL_PPPP(sve_eor_pppp, DO_EOR) 148 LOGICAL_PPPP(sve_sel_pppp, DO_SEL) 149 LOGICAL_PPPP(sve_orr_pppp, DO_ORR) 150 LOGICAL_PPPP(sve_orn_pppp, DO_ORN) 151 LOGICAL_PPPP(sve_nor_pppp, DO_NOR) 152 LOGICAL_PPPP(sve_nand_pppp, DO_NAND) 153 154 #undef DO_AND 155 #undef DO_BIC 156 #undef DO_EOR 157 #undef DO_ORR 158 #undef DO_ORN 159 #undef DO_NOR 160 #undef DO_NAND 161 #undef DO_SEL 162 #undef LOGICAL_PPPP 163 164 /* Fully general three-operand expander, controlled by a predicate. 165 * This is complicated by the host-endian storage of the register file. 166 */ 167 /* ??? I don't expect the compiler could ever vectorize this itself. 168 * With some tables we can convert bit masks to byte masks, and with 169 * extra care wrt byte/word ordering we could use gcc generic vectors 170 * and do 16 bytes at a time. 171 */ 172 #define DO_ZPZZ(NAME, TYPE, H, OP) \ 173 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 174 { \ 175 intptr_t i, opr_sz = simd_oprsz(desc); \ 176 for (i = 0; i < opr_sz; ) { \ 177 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 178 do { \ 179 if (pg & 1) { \ 180 TYPE nn = *(TYPE *)(vn + H(i)); \ 181 TYPE mm = *(TYPE *)(vm + H(i)); \ 182 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 183 } \ 184 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 185 } while (i & 15); \ 186 } \ 187 } 188 189 /* Similarly, specialized for 64-bit operands. */ 190 #define DO_ZPZZ_D(NAME, TYPE, OP) \ 191 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 192 { \ 193 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 194 TYPE *d = vd, *n = vn, *m = vm; \ 195 uint8_t *pg = vg; \ 196 for (i = 0; i < opr_sz; i += 1) { \ 197 if (pg[H1(i)] & 1) { \ 198 TYPE nn = n[i], mm = m[i]; \ 199 d[i] = OP(nn, mm); \ 200 } \ 201 } \ 202 } 203 204 #define DO_AND(N, M) (N & M) 205 #define DO_EOR(N, M) (N ^ M) 206 #define DO_ORR(N, M) (N | M) 207 #define DO_BIC(N, M) (N & ~M) 208 #define DO_ADD(N, M) (N + M) 209 #define DO_SUB(N, M) (N - M) 210 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) 211 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) 212 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N)) 213 #define DO_MUL(N, M) (N * M) 214 215 216 /* 217 * We must avoid the C undefined behaviour cases: division by 218 * zero and signed division of INT_MIN by -1. Both of these 219 * have architecturally defined required results for Arm. 
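 * For INT_MIN / -1 the required result is INT_MIN itself, which the -N
 * case below produces via wrapping negation (well defined here because
 * QEMU is built with -fwrapv); division by zero must return 0.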
220 * We special case all signed divisions by -1 to avoid having 221 * to deduce the minimum integer for the type involved. 222 */ 223 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M) 224 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M) 225 226 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND) 227 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND) 228 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND) 229 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND) 230 231 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR) 232 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR) 233 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR) 234 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR) 235 236 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR) 237 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR) 238 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR) 239 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR) 240 241 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC) 242 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC) 243 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC) 244 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC) 245 246 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) 247 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD) 248 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD) 249 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD) 250 251 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB) 252 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB) 253 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB) 254 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB) 255 256 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX) 257 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX) 258 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX) 259 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX) 260 261 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX) 262 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX) 263 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX) 264 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX) 265 266 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN) 267 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN) 268 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN) 269 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN) 270 271 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN) 272 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN) 273 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN) 274 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN) 275 276 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD) 277 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD) 278 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD) 279 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD) 280 281 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD) 282 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD) 283 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD) 284 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD) 285 286 /* Because the computation type is at least twice as large as required, 287 these work for both signed and unsigned source types. 
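   For example, sve_smulh_zpzz_b feeds sign-extended int8_t values and
   sve_umulh_zpzz_b zero-extended uint8_t values into do_mulh_b; the
   32-bit product is exact either way, so (n * m) >> 8 is the high half.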
 */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}

DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)

/* Note that all bits of the shift are significant
   and not modulo the element size.  */
#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)

/* The H macro must match the element size: H1 for bytes,
   H1_2 for halfwords, H1_4 for words.  */
DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)

DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)

static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    int8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    int16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    int32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)

static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    uint8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    uint16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    uint32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)

#define do_srshl_b(n, m) \
do_sqrshl_bhs(n, m, 8, true, NULL) 405 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL) 406 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL) 407 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL) 408 409 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b) 410 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h) 411 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s) 412 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d) 413 414 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL) 415 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL) 416 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL) 417 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL) 418 419 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b) 420 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h) 421 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s) 422 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d) 423 424 /* 425 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set. 426 * We pass in a pointer to a dummy saturation field to trigger 427 * the saturating arithmetic but discard the information about 428 * whether it has occurred. 429 */ 430 #define do_sqshl_b(n, m) \ 431 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); }) 432 #define do_sqshl_h(n, m) \ 433 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); }) 434 #define do_sqshl_s(n, m) \ 435 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); }) 436 #define do_sqshl_d(n, m) \ 437 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); }) 438 439 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b) 440 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h) 441 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s) 442 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d) 443 444 #define do_uqshl_b(n, m) \ 445 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 446 #define do_uqshl_h(n, m) \ 447 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 448 #define do_uqshl_s(n, m) \ 449 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); }) 450 #define do_uqshl_d(n, m) \ 451 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); }) 452 453 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b) 454 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h) 455 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s) 456 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d) 457 458 #define do_sqrshl_b(n, m) \ 459 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); }) 460 #define do_sqrshl_h(n, m) \ 461 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); }) 462 #define do_sqrshl_s(n, m) \ 463 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); }) 464 #define do_sqrshl_d(n, m) \ 465 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); }) 466 467 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b) 468 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h) 469 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s) 470 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d) 471 472 #undef do_sqrshl_d 473 474 #define do_uqrshl_b(n, m) \ 475 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); }) 476 #define do_uqrshl_h(n, m) \ 477 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); }) 478 #define do_uqrshl_s(n, m) \ 479 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); }) 480 #define do_uqrshl_d(n, m) \ 481 ({ uint32_t discard; do_uqrshl_d(n, 
m, true, &discard); }) 482 483 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b) 484 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h) 485 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s) 486 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d) 487 488 #undef do_uqrshl_d 489 490 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1) 491 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1)) 492 493 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS) 494 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS) 495 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS) 496 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D) 497 498 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS) 499 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS) 500 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS) 501 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D) 502 503 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1) 504 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1)) 505 506 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS) 507 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS) 508 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS) 509 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D) 510 511 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS) 512 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS) 513 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS) 514 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D) 515 516 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1) 517 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1)) 518 519 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS) 520 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS) 521 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS) 522 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D) 523 524 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS) 525 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS) 526 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS) 527 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D) 528 529 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max) 530 { 531 return val >= max ? max : val <= min ? min : val; 532 } 533 534 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX) 535 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX) 536 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX) 537 538 static inline int64_t do_sqadd_d(int64_t n, int64_t m) 539 { 540 int64_t r = n + m; 541 if (((r ^ n) & ~(n ^ m)) < 0) { 542 /* Signed overflow. */ 543 return r < 0 ? INT64_MAX : INT64_MIN; 544 } 545 return r; 546 } 547 548 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B) 549 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H) 550 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S) 551 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d) 552 553 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX) 554 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX) 555 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX) 556 557 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m) 558 { 559 uint64_t r = n + m; 560 return r < n ? 
                     UINT64_MAX : r;
}

DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = n - m;
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow. */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}

DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)

static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    return n > m ? n - m : 0;
}

DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative. */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative. */
        }
    } else {
        /* Both inputs are positive: check for overflow. */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}

DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)

static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        return n < -m ? 0 : r;
    }
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

#undef DO_ZPZZ
#undef DO_ZPZZ_D

/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements are from VN {I, I+1}.
 * If the slot I is odd, the elements are from VM {I-1, I}.
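 * For example, for byte elements: D[0] = OP(N[0], N[1]),
 * D[1] = OP(M[0], M[1]), D[2] = OP(N[2], N[3]), and so on, with each
 * store still gated by its own predicate bit.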
665 * Load all of the input elements in each pair before overwriting output. 666 */ 667 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \ 668 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 669 { \ 670 intptr_t i, opr_sz = simd_oprsz(desc); \ 671 for (i = 0; i < opr_sz; ) { \ 672 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 673 do { \ 674 TYPE n0 = *(TYPE *)(vn + H(i)); \ 675 TYPE m0 = *(TYPE *)(vm + H(i)); \ 676 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 677 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 678 if (pg & 1) { \ 679 *(TYPE *)(vd + H(i)) = OP(n0, n1); \ 680 } \ 681 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 682 if (pg & 1) { \ 683 *(TYPE *)(vd + H(i)) = OP(m0, m1); \ 684 } \ 685 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 686 } while (i & 15); \ 687 } \ 688 } 689 690 /* Similarly, specialized for 64-bit operands. */ 691 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \ 692 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 693 { \ 694 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 695 TYPE *d = vd, *n = vn, *m = vm; \ 696 uint8_t *pg = vg; \ 697 for (i = 0; i < opr_sz; i += 2) { \ 698 TYPE n0 = n[i], n1 = n[i + 1]; \ 699 TYPE m0 = m[i], m1 = m[i + 1]; \ 700 if (pg[H1(i)] & 1) { \ 701 d[i] = OP(n0, n1); \ 702 } \ 703 if (pg[H1(i + 1)] & 1) { \ 704 d[i + 1] = OP(m0, m1); \ 705 } \ 706 } \ 707 } 708 709 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD) 710 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD) 711 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD) 712 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD) 713 714 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX) 715 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX) 716 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX) 717 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX) 718 719 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN) 720 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN) 721 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN) 722 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN) 723 724 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX) 725 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX) 726 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX) 727 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX) 728 729 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN) 730 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN) 731 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN) 732 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN) 733 734 #undef DO_ZPZZ_PAIR 735 #undef DO_ZPZZ_PAIR_D 736 737 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \ 738 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 739 float_status *status, uint32_t desc) \ 740 { \ 741 intptr_t i, opr_sz = simd_oprsz(desc); \ 742 for (i = 0; i < opr_sz; ) { \ 743 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 744 do { \ 745 TYPE n0 = *(TYPE *)(vn + H(i)); \ 746 TYPE m0 = *(TYPE *)(vm + H(i)); \ 747 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 748 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 749 if (pg & 1) { \ 750 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \ 751 } \ 752 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 753 if (pg & 1) { \ 754 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \ 755 } \ 756 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 757 } while (i & 15); \ 758 } \ 759 } 760 761 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add) 762 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add) 763 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, 
float64, H1_8, float64_add) 764 765 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum) 766 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum) 767 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum) 768 769 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum) 770 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum) 771 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum) 772 773 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max) 774 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max) 775 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max) 776 777 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min) 778 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min) 779 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min) 780 781 #undef DO_ZPZZ_PAIR_FP 782 783 /* Three-operand expander, controlled by a predicate, in which the 784 * third operand is "wide". That is, for D = N op M, the same 64-bit 785 * value of M is used with all of the narrower values of N. 786 */ 787 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \ 788 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 789 { \ 790 intptr_t i, opr_sz = simd_oprsz(desc); \ 791 for (i = 0; i < opr_sz; ) { \ 792 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \ 793 TYPEW mm = *(TYPEW *)(vm + i); \ 794 do { \ 795 if (pg & 1) { \ 796 TYPE nn = *(TYPE *)(vn + H(i)); \ 797 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 798 } \ 799 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 800 } while (i & 7); \ 801 } \ 802 } 803 804 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR) 805 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR) 806 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL) 807 808 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR) 809 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 810 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 811 812 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR) 813 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 814 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 815 816 #undef DO_ZPZW 817 818 /* Fully general two-operand expander, controlled by a predicate. 819 */ 820 #define DO_ZPZ(NAME, TYPE, H, OP) \ 821 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 822 { \ 823 intptr_t i, opr_sz = simd_oprsz(desc); \ 824 for (i = 0; i < opr_sz; ) { \ 825 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 826 do { \ 827 if (pg & 1) { \ 828 TYPE nn = *(TYPE *)(vn + H(i)); \ 829 *(TYPE *)(vd + H(i)) = OP(nn); \ 830 } \ 831 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 832 } while (i & 15); \ 833 } \ 834 } 835 836 /* Similarly, specialized for 64-bit operands. 
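   Each 64-bit element is gated by bit 0 of its predicate byte (indexed
   with H1); the data elements are indexed directly, since whole
   uint64_t units are already stored in host order.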
*/ 837 #define DO_ZPZ_D(NAME, TYPE, OP) \ 838 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 839 { \ 840 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 841 TYPE *d = vd, *n = vn; \ 842 uint8_t *pg = vg; \ 843 for (i = 0; i < opr_sz; i += 1) { \ 844 if (pg[H1(i)] & 1) { \ 845 TYPE nn = n[i]; \ 846 d[i] = OP(nn); \ 847 } \ 848 } \ 849 } 850 851 #define DO_CLS_B(N) (clrsb32(N) - 24) 852 #define DO_CLS_H(N) (clrsb32(N) - 16) 853 854 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B) 855 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H) 856 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32) 857 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64) 858 859 #define DO_CLZ_B(N) (clz32(N) - 24) 860 #define DO_CLZ_H(N) (clz32(N) - 16) 861 862 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B) 863 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H) 864 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32) 865 DO_ZPZ_D(sve_clz_d, uint64_t, clz64) 866 867 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8) 868 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16) 869 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32) 870 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64) 871 872 #define DO_CNOT(N) (N == 0) 873 874 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT) 875 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT) 876 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT) 877 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT) 878 879 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1)) 880 881 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS) 882 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS) 883 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS) 884 885 #define DO_AH_FABS_H(N) (float16_is_any_nan(N) ? (N) : DO_FABS(N)) 886 #define DO_AH_FABS_S(N) (float32_is_any_nan(N) ? (N) : DO_FABS(N)) 887 #define DO_AH_FABS_D(N) (float64_is_any_nan(N) ? (N) : DO_FABS(N)) 888 889 DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H) 890 DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S) 891 DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D) 892 893 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1)) 894 895 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG) 896 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG) 897 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG) 898 899 #define DO_AH_FNEG_H(N) (float16_is_any_nan(N) ? (N) : DO_FNEG(N)) 900 #define DO_AH_FNEG_S(N) (float32_is_any_nan(N) ? (N) : DO_FNEG(N)) 901 #define DO_AH_FNEG_D(N) (float64_is_any_nan(N) ? (N) : DO_FNEG(N)) 902 903 DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H) 904 DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S) 905 DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D) 906 907 #define DO_NOT(N) (~N) 908 909 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT) 910 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT) 911 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT) 912 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT) 913 914 #define DO_SXTB(N) ((int8_t)N) 915 #define DO_SXTH(N) ((int16_t)N) 916 #define DO_SXTS(N) ((int32_t)N) 917 #define DO_UXTB(N) ((uint8_t)N) 918 #define DO_UXTH(N) ((uint16_t)N) 919 #define DO_UXTS(N) ((uint32_t)N) 920 921 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB) 922 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB) 923 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH) 924 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB) 925 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH) 926 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS) 927 928 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB) 929 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB) 930 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH) 931 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB) 932 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH) 933 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS) 934 935 #define DO_ABS(N) (N < 0 ? 
-N : N) 936 937 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS) 938 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS) 939 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS) 940 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS) 941 942 #define DO_NEG(N) (-N) 943 944 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG) 945 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG) 946 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG) 947 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG) 948 949 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16) 950 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32) 951 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64) 952 953 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32) 954 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64) 955 956 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64) 957 958 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc) 959 { 960 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 961 uint64_t *d = vd, *n = vn; 962 uint8_t *pg = vg; 963 964 for (i = 0; i < opr_sz; i += 2) { 965 if (pg[H1(i)] & 1) { 966 uint64_t n0 = n[i + 0]; 967 uint64_t n1 = n[i + 1]; 968 d[i + 0] = n1; 969 d[i + 1] = n0; 970 } 971 } 972 } 973 974 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8) 975 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16) 976 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32) 977 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64) 978 979 #define DO_SQABS(X) \ 980 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 981 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; }) 982 983 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS) 984 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS) 985 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS) 986 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS) 987 988 #define DO_SQNEG(X) \ 989 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \ 990 x_ == min_ ? -min_ - 1 : -x_; }) 991 992 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG) 993 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG) 994 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG) 995 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG) 996 997 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32) 998 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32) 999 1000 /* Three-operand expander, unpredicated, in which the third operand is "wide". 1001 */ 1002 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \ 1003 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1004 { \ 1005 intptr_t i, opr_sz = simd_oprsz(desc); \ 1006 for (i = 0; i < opr_sz; ) { \ 1007 TYPEW mm = *(TYPEW *)(vm + i); \ 1008 do { \ 1009 TYPE nn = *(TYPE *)(vn + H(i)); \ 1010 *(TYPE *)(vd + H(i)) = OP(nn, mm); \ 1011 i += sizeof(TYPE); \ 1012 } while (i & 7); \ 1013 } \ 1014 } 1015 1016 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR) 1017 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR) 1018 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL) 1019 1020 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR) 1021 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR) 1022 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL) 1023 1024 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR) 1025 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR) 1026 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL) 1027 1028 #undef DO_ZZW 1029 1030 #undef DO_CLS_B 1031 #undef DO_CLS_H 1032 #undef DO_CLZ_B 1033 #undef DO_CLZ_H 1034 #undef DO_CNOT 1035 #undef DO_FABS 1036 #undef DO_FNEG 1037 #undef DO_ABS 1038 #undef DO_NEG 1039 #undef DO_ZPZ 1040 #undef DO_ZPZ_D 1041 1042 /* 1043 * Three-operand expander, unpredicated, in which the two inputs are 1044 * selected from the top or bottom half of the wide column. 
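 * For example, with TYPEW = int16_t and TYPEN = int8_t, sel1 and sel2
 * are byte offsets of 0 or 1 within each 16-bit column, selecting the
 * even (bottom, *B) or odd (top, *T) narrow elements of Zn and Zm.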
1045 */ 1046 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1047 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1048 { \ 1049 intptr_t i, opr_sz = simd_oprsz(desc); \ 1050 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1051 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1052 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1053 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1054 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1055 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1056 } \ 1057 } 1058 1059 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1060 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1061 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1062 1063 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1064 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1065 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1066 1067 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1068 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1069 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1070 1071 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1072 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1073 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1074 1075 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1076 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1077 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1078 1079 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1080 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1081 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1082 1083 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1084 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1085 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1086 1087 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1088 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1089 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1090 1091 /* Note that the multiply cannot overflow, but the doubling can. 
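   For example, in do_sqdmull_h the operands were widened from int8_t,
   so |n * m| <= 16384 always fits in int16_t, but doubling the extreme
   case (-128 * -128) * 2 = 32768 overflows and saturates to INT16_MAX.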
*/ 1092 static inline int16_t do_sqdmull_h(int16_t n, int16_t m) 1093 { 1094 int16_t val = n * m; 1095 return DO_SQADD_H(val, val); 1096 } 1097 1098 static inline int32_t do_sqdmull_s(int32_t n, int32_t m) 1099 { 1100 int32_t val = n * m; 1101 return DO_SQADD_S(val, val); 1102 } 1103 1104 static inline int64_t do_sqdmull_d(int64_t n, int64_t m) 1105 { 1106 int64_t val = n * m; 1107 return do_sqadd_d(val, val); 1108 } 1109 1110 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h) 1111 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1112 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1113 1114 #undef DO_ZZZ_TB 1115 1116 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1117 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1118 { \ 1119 intptr_t i, opr_sz = simd_oprsz(desc); \ 1120 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1121 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1122 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 1123 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1124 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \ 1125 } \ 1126 } 1127 1128 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD) 1129 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD) 1130 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD) 1131 1132 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB) 1133 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB) 1134 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB) 1135 1136 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD) 1137 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD) 1138 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD) 1139 1140 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB) 1141 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB) 1142 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB) 1143 1144 #undef DO_ZZZ_WTB 1145 1146 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \ 1147 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1148 { \ 1149 intptr_t i, opr_sz = simd_oprsz(desc); \ 1150 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \ 1151 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \ 1152 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1153 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \ 1154 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \ 1155 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \ 1156 } \ 1157 } 1158 1159 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR) 1160 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR) 1161 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR) 1162 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR) 1163 1164 #undef DO_ZZZ_NTB 1165 1166 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1167 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1168 { \ 1169 intptr_t i, opr_sz = simd_oprsz(desc); \ 1170 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \ 1171 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1172 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1173 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \ 1174 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1175 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \ 1176 } \ 1177 } 1178 1179 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD) 1180 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD) 1181 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD) 1182 1183 
DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD) 1184 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD) 1185 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD) 1186 1187 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL) 1188 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1189 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1190 1191 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL) 1192 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1193 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1194 1195 #define DO_NMUL(N, M) -(N * M) 1196 1197 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL) 1198 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL) 1199 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL) 1200 1201 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL) 1202 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL) 1203 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL) 1204 1205 #undef DO_ZZZW_ACC 1206 1207 #define DO_XTNB(NAME, TYPE, OP) \ 1208 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1209 { \ 1210 intptr_t i, opr_sz = simd_oprsz(desc); \ 1211 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1212 TYPE nn = *(TYPE *)(vn + i); \ 1213 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \ 1214 *(TYPE *)(vd + i) = nn; \ 1215 } \ 1216 } 1217 1218 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \ 1219 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1220 { \ 1221 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \ 1222 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1223 TYPE nn = *(TYPE *)(vn + i); \ 1224 *(TYPEN *)(vd + i + odd) = OP(nn); \ 1225 } \ 1226 } 1227 1228 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX) 1229 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX) 1230 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX) 1231 1232 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H) 1233 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S) 1234 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D) 1235 1236 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H) 1237 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S) 1238 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D) 1239 1240 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX) 1241 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX) 1242 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX) 1243 1244 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H) 1245 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S) 1246 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D) 1247 1248 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H) 1249 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S) 1250 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D) 1251 1252 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H) 1253 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S) 1254 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D) 1255 1256 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H) 1257 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S) 1258 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D) 1259 1260 #undef DO_XTNB 1261 #undef DO_XTNT 1262 1263 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1264 { 1265 intptr_t i, opr_sz = simd_oprsz(desc); 1266 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1)); 1267 uint32_t 
inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1268 uint32_t *a = va, *n = vn; 1269 uint64_t *d = vd, *m = vm; 1270 1271 for (i = 0; i < opr_sz / 8; ++i) { 1272 uint32_t e1 = a[2 * i + H4(0)]; 1273 uint32_t e2 = n[2 * i + sel] ^ inv; 1274 uint64_t c = extract64(m[i], 32, 1); 1275 /* Compute and store the entire 33-bit result at once. */ 1276 d[i] = c + e1 + e2; 1277 } 1278 } 1279 1280 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 1281 { 1282 intptr_t i, opr_sz = simd_oprsz(desc); 1283 int sel = extract32(desc, SIMD_DATA_SHIFT, 1); 1284 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1285 uint64_t *d = vd, *a = va, *n = vn, *m = vm; 1286 1287 for (i = 0; i < opr_sz / 8; i += 2) { 1288 Int128 e1 = int128_make64(a[i]); 1289 Int128 e2 = int128_make64(n[i + sel] ^ inv); 1290 Int128 c = int128_make64(m[i + 1] & 1); 1291 Int128 r = int128_add(int128_add(e1, e2), c); 1292 d[i + 0] = int128_getlo(r); 1293 d[i + 1] = int128_gethi(r); 1294 } 1295 } 1296 1297 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \ 1298 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1299 { \ 1300 intptr_t i, opr_sz = simd_oprsz(desc); \ 1301 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1302 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \ 1303 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1304 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \ 1305 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \ 1306 TYPEW aa = *(TYPEW *)(va + HW(i)); \ 1307 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \ 1308 } \ 1309 } 1310 1311 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1, 1312 do_sqdmull_h, DO_SQADD_H) 1313 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1314 do_sqdmull_s, DO_SQADD_S) 1315 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1316 do_sqdmull_d, do_sqadd_d) 1317 1318 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1, 1319 do_sqdmull_h, DO_SQSUB_H) 1320 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, 1321 do_sqdmull_s, DO_SQSUB_S) 1322 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, 1323 do_sqdmull_d, do_sqsub_d) 1324 1325 #undef DO_SQDMLAL 1326 1327 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \ 1328 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1329 { \ 1330 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1331 int rot = simd_data(desc); \ 1332 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1333 bool sub_r = rot == 1 || rot == 2; \ 1334 bool sub_i = rot >= 2; \ 1335 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1336 for (i = 0; i < opr_sz; i += 2) { \ 1337 TYPE elt1_a = n[H(i + sel_a)]; \ 1338 TYPE elt2_a = m[H(i + sel_a)]; \ 1339 TYPE elt2_b = m[H(i + sel_b)]; \ 1340 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \ 1341 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \ 1342 } \ 1343 } 1344 1345 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? 
-1 : 1)) 1346 1347 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA) 1348 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA) 1349 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA) 1350 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA) 1351 1352 #define DO_SQRDMLAH_B(N, M, A, S) \ 1353 do_sqrdmlah_b(N, M, A, S, true) 1354 #define DO_SQRDMLAH_H(N, M, A, S) \ 1355 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); }) 1356 #define DO_SQRDMLAH_S(N, M, A, S) \ 1357 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); }) 1358 #define DO_SQRDMLAH_D(N, M, A, S) \ 1359 do_sqrdmlah_d(N, M, A, S, true) 1360 1361 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B) 1362 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H) 1363 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S) 1364 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D) 1365 1366 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \ 1367 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1368 { \ 1369 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1370 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \ 1371 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \ 1372 int sel_a = rot & 1, sel_b = sel_a ^ 1; \ 1373 bool sub_r = rot == 1 || rot == 2; \ 1374 bool sub_i = rot >= 2; \ 1375 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1376 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \ 1377 TYPE elt2_a = m[H(i + idx + sel_a)]; \ 1378 TYPE elt2_b = m[H(i + idx + sel_b)]; \ 1379 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \ 1380 TYPE elt1_a = n[H(i + j + sel_a)]; \ 1381 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \ 1382 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \ 1383 } \ 1384 } \ 1385 } 1386 1387 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA) 1388 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA) 1389 1390 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1391 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1392 1393 #undef DO_CMLA 1394 #undef DO_CMLA_FUNC 1395 #undef DO_CMLA_IDX_FUNC 1396 #undef DO_SQRDMLAH_B 1397 #undef DO_SQRDMLAH_H 1398 #undef DO_SQRDMLAH_S 1399 #undef DO_SQRDMLAH_D 1400 1401 /* Note N and M are 4 elements bundled into one unit. */ 1402 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a, 1403 int sel_a, int sel_b, int sub_i) 1404 { 1405 for (int i = 0; i <= 1; i++) { 1406 int32_t elt1_r = (int8_t)(n >> (16 * i)); 1407 int32_t elt1_i = (int8_t)(n >> (16 * i + 8)); 1408 int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a)); 1409 int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b)); 1410 1411 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1412 } 1413 return a; 1414 } 1415 1416 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a, 1417 int sel_a, int sel_b, int sub_i) 1418 { 1419 for (int i = 0; i <= 1; i++) { 1420 int64_t elt1_r = (int16_t)(n >> (32 * i + 0)); 1421 int64_t elt1_i = (int16_t)(n >> (32 * i + 16)); 1422 int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a)); 1423 int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b)); 1424 1425 a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i; 1426 } 1427 return a; 1428 } 1429 1430 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm, 1431 void *va, uint32_t desc) 1432 { 1433 int opr_sz = simd_oprsz(desc); 1434 int rot = simd_data(desc); 1435 int sel_a = rot & 1; 1436 int sel_b = sel_a ^ 1; 1437 int sub_i = (rot == 0 || rot == 3 ? 
-1 : 1); 1438 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1439 1440 for (int e = 0; e < opr_sz / 4; e++) { 1441 d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1442 } 1443 } 1444 1445 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm, 1446 void *va, uint32_t desc) 1447 { 1448 int opr_sz = simd_oprsz(desc); 1449 int rot = simd_data(desc); 1450 int sel_a = rot & 1; 1451 int sel_b = sel_a ^ 1; 1452 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1453 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1454 1455 for (int e = 0; e < opr_sz / 8; e++) { 1456 d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i); 1457 } 1458 } 1459 1460 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm, 1461 void *va, uint32_t desc) 1462 { 1463 int opr_sz = simd_oprsz(desc); 1464 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1465 int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2)); 1466 int sel_a = rot & 1; 1467 int sel_b = sel_a ^ 1; 1468 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1469 uint32_t *d = vd, *n = vn, *m = vm, *a = va; 1470 1471 for (int seg = 0; seg < opr_sz / 4; seg += 4) { 1472 uint32_t seg_m = m[seg + idx]; 1473 for (int e = 0; e < 4; e++) { 1474 d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e], 1475 sel_a, sel_b, sub_i); 1476 } 1477 } 1478 } 1479 1480 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm, 1481 void *va, uint32_t desc) 1482 { 1483 int seg, opr_sz = simd_oprsz(desc); 1484 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); 1485 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 1486 int sel_a = rot & 1; 1487 int sel_b = sel_a ^ 1; 1488 int sub_i = (rot == 0 || rot == 3 ? -1 : 1); 1489 uint64_t *d = vd, *n = vn, *m = vm, *a = va; 1490 1491 for (seg = 0; seg < opr_sz / 8; seg += 2) { 1492 uint64_t seg_m = m[seg + idx]; 1493 for (int e = 0; e < 2; e++) { 1494 d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e], 1495 sel_a, sel_b, sub_i); 1496 } 1497 } 1498 } 1499 1500 #define DO_ZZXZ(NAME, TYPE, H, OP) \ 1501 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1502 { \ 1503 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ 1504 intptr_t i, j, idx = simd_data(desc); \ 1505 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \ 1506 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1507 TYPE mm = m[i]; \ 1508 for (j = 0; j < segment; j++) { \ 1509 d[i + j] = OP(n[i + j], mm, a[i + j]); \ 1510 } \ 1511 } \ 1512 } 1513 1514 #define DO_SQRDMLAH_H(N, M, A) \ 1515 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); }) 1516 #define DO_SQRDMLAH_S(N, M, A) \ 1517 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); }) 1518 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true) 1519 1520 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H) 1521 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S) 1522 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D) 1523 1524 #define DO_SQRDMLSH_H(N, M, A) \ 1525 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); }) 1526 #define DO_SQRDMLSH_S(N, M, A) \ 1527 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); }) 1528 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true) 1529 1530 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H) 1531 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S) 1532 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D) 1533 1534 #undef DO_ZZXZ 1535 1536 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1537 void HELPER(NAME)(void *vd, void *vn, 
void *vm, void *va, uint32_t desc) \ 1538 { \ 1539 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1540 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1541 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1542 for (i = 0; i < oprsz; i += 16) { \ 1543 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1544 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1545 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1546 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \ 1547 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \ 1548 } \ 1549 } \ 1550 } 1551 1552 #define DO_MLA(N, M, A) (A + N * M) 1553 1554 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA) 1555 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA) 1556 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA) 1557 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA) 1558 1559 #define DO_MLS(N, M, A) (A - N * M) 1560 1561 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS) 1562 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS) 1563 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS) 1564 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS) 1565 1566 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M)) 1567 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M)) 1568 1569 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S) 1570 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D) 1571 1572 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M)) 1573 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M)) 1574 1575 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S) 1576 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D) 1577 1578 #undef DO_MLA 1579 #undef DO_MLS 1580 #undef DO_ZZXW 1581 1582 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \ 1583 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1584 { \ 1585 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1586 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \ 1587 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \ 1588 for (i = 0; i < oprsz; i += 16) { \ 1589 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \ 1590 for (j = 0; j < 16; j += sizeof(TYPEW)) { \ 1591 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \ 1592 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \ 1593 } \ 1594 } \ 1595 } 1596 1597 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s) 1598 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d) 1599 1600 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL) 1601 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL) 1602 1603 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL) 1604 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL) 1605 1606 #undef DO_ZZX 1607 1608 #define DO_BITPERM(NAME, TYPE, OP) \ 1609 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1610 { \ 1611 intptr_t i, opr_sz = simd_oprsz(desc); \ 1612 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 1613 TYPE nn = *(TYPE *)(vn + i); \ 1614 TYPE mm = *(TYPE *)(vm + i); \ 1615 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \ 1616 } \ 1617 } 1618 1619 static uint64_t bitextract(uint64_t data, uint64_t mask, int n) 1620 { 1621 uint64_t res = 0; 1622 int db, rb = 0; 1623 1624 for (db = 0; db < n; ++db) { 1625 if ((mask >> db) & 1) { 1626 res |= ((data >> db) & 1) 
<< rb; 1627 ++rb; 1628 } 1629 } 1630 return res; 1631 } 1632 1633 DO_BITPERM(sve2_bext_b, uint8_t, bitextract) 1634 DO_BITPERM(sve2_bext_h, uint16_t, bitextract) 1635 DO_BITPERM(sve2_bext_s, uint32_t, bitextract) 1636 DO_BITPERM(sve2_bext_d, uint64_t, bitextract) 1637 1638 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) 1639 { 1640 uint64_t res = 0; 1641 int rb, db = 0; 1642 1643 for (rb = 0; rb < n; ++rb) { 1644 if ((mask >> rb) & 1) { 1645 res |= ((data >> db) & 1) << rb; 1646 ++db; 1647 } 1648 } 1649 return res; 1650 } 1651 1652 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) 1653 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) 1654 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) 1655 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit) 1656 1657 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) 1658 { 1659 uint64_t resm = 0, resu = 0; 1660 int db, rbm = 0, rbu = 0; 1661 1662 for (db = 0; db < n; ++db) { 1663 uint64_t val = (data >> db) & 1; 1664 if ((mask >> db) & 1) { 1665 resm |= val << rbm++; 1666 } else { 1667 resu |= val << rbu++; 1668 } 1669 } 1670 1671 return resm | (resu << rbm); 1672 } 1673 1674 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) 1675 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) 1676 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) 1677 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) 1678 1679 #undef DO_BITPERM 1680 1681 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ 1682 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1683 { \ 1684 intptr_t i, opr_sz = simd_oprsz(desc); \ 1685 int sub_r = simd_data(desc); \ 1686 if (sub_r) { \ 1687 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1688 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1689 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1690 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1691 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1692 acc_r = ADD_OP(acc_r, el2_i); \ 1693 acc_i = SUB_OP(acc_i, el2_r); \ 1694 *(TYPE *)(vd + H(i)) = acc_r; \ 1695 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1696 } \ 1697 } else { \ 1698 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1699 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1700 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1701 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1702 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1703 acc_r = SUB_OP(acc_r, el2_i); \ 1704 acc_i = ADD_OP(acc_i, el2_r); \ 1705 *(TYPE *)(vd + H(i)) = acc_r; \ 1706 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1707 } \ 1708 } \ 1709 } 1710 1711 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) 1712 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) 1713 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) 1714 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) 1715 1716 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) 1717 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) 1718 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) 1719 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) 1720 1721 #undef DO_CADD 1722 1723 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ 1724 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1725 { \ 1726 intptr_t i, opr_sz = simd_oprsz(desc); \ 1727 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ 1728 int shift = simd_data(desc) >> 1; \ 1729 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1730 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ 1731 *(TYPEW *)(vd + HW(i)) = nn << shift; \ 1732 } \ 1733 } 1734 1735 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) 1736 DO_ZZI_SHLL(sve2_sshll_s, int32_t, 
int16_t, H1_4, H1_2) 1737 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) 1738 1739 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) 1740 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) 1741 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) 1742 1743 #undef DO_ZZI_SHLL 1744 1745 /* Two-operand reduction expander, controlled by a predicate. 1746 * The difference between TYPERED and TYPERET has to do with 1747 * sign-extension. E.g. for SMAX, TYPERED must be signed, 1748 * but TYPERET must be unsigned so that e.g. a 32-bit value 1749 * is not sign-extended to the ABI uint64_t return type. 1750 */ 1751 /* ??? If we were to vectorize this by hand the reduction ordering 1752 * would change. For integer operands, this is perfectly fine. 1753 */ 1754 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ 1755 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1756 { \ 1757 intptr_t i, opr_sz = simd_oprsz(desc); \ 1758 TYPERED ret = INIT; \ 1759 for (i = 0; i < opr_sz; ) { \ 1760 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 1761 do { \ 1762 if (pg & 1) { \ 1763 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ 1764 ret = OP(ret, nn); \ 1765 } \ 1766 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ 1767 } while (i & 15); \ 1768 } \ 1769 return (TYPERET)ret; \ 1770 } 1771 1772 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ 1773 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1774 { \ 1775 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 1776 TYPEE *n = vn; \ 1777 uint8_t *pg = vg; \ 1778 TYPER ret = INIT; \ 1779 for (i = 0; i < opr_sz; i += 1) { \ 1780 if (pg[H1(i)] & 1) { \ 1781 TYPEE nn = n[i]; \ 1782 ret = OP(ret, nn); \ 1783 } \ 1784 } \ 1785 return ret; \ 1786 } 1787 1788 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) 1789 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) 1790 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) 1791 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) 1792 1793 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) 1794 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) 1795 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) 1796 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) 1797 1798 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) 1799 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) 1800 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) 1801 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) 1802 1803 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1804 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1805 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1806 1807 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1808 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1809 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1810 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) 1811 1812 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) 1813 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) 1814 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) 1815 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) 1816 1817 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) 1818 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) 1819 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) 
1820 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) 1821 1822 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) 1823 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) 1824 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) 1825 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) 1826 1827 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) 1828 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) 1829 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) 1830 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) 1831 1832 #undef DO_VPZ 1833 #undef DO_VPZ_D 1834 1835 /* Two vector operand, one scalar operand, unpredicated. */ 1836 #define DO_ZZI(NAME, TYPE, OP) \ 1837 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ 1838 { \ 1839 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1840 TYPE s = s64, *d = vd, *n = vn; \ 1841 for (i = 0; i < opr_sz; ++i) { \ 1842 d[i] = OP(n[i], s); \ 1843 } \ 1844 } 1845 1846 #define DO_SUBR(X, Y) (Y - X) 1847 1848 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) 1849 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) 1850 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) 1851 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) 1852 1853 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) 1854 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) 1855 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) 1856 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) 1857 1858 DO_ZZI(sve_smini_b, int8_t, DO_MIN) 1859 DO_ZZI(sve_smini_h, int16_t, DO_MIN) 1860 DO_ZZI(sve_smini_s, int32_t, DO_MIN) 1861 DO_ZZI(sve_smini_d, int64_t, DO_MIN) 1862 1863 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) 1864 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) 1865 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) 1866 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) 1867 1868 DO_ZZI(sve_umini_b, uint8_t, DO_MIN) 1869 DO_ZZI(sve_umini_h, uint16_t, DO_MIN) 1870 DO_ZZI(sve_umini_s, uint32_t, DO_MIN) 1871 DO_ZZI(sve_umini_d, uint64_t, DO_MIN) 1872 1873 #undef DO_ZZI 1874 1875 #undef DO_AND 1876 #undef DO_ORR 1877 #undef DO_EOR 1878 #undef DO_BIC 1879 #undef DO_ADD 1880 #undef DO_SUB 1881 #undef DO_MAX 1882 #undef DO_MIN 1883 #undef DO_ABD 1884 #undef DO_MUL 1885 #undef DO_DIV 1886 #undef DO_ASR 1887 #undef DO_LSR 1888 #undef DO_LSL 1889 #undef DO_SUBR 1890 1891 /* Similar to the ARM LastActiveElement pseudocode function, except the 1892 result is multiplied by the element size. This includes the not found 1893 indication; e.g. not found for esz=3 is -8. */ 1894 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) 1895 { 1896 uint64_t mask = pred_esz_masks[esz]; 1897 intptr_t i = words; 1898 1899 do { 1900 uint64_t this_g = g[--i] & mask; 1901 if (this_g) { 1902 return i * 64 + (63 - clz64(this_g)); 1903 } 1904 } while (i > 0); 1905 return (intptr_t)-1 << esz; 1906 } 1907 1908 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) 1909 { 1910 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1911 uint32_t flags = PREDTEST_INIT; 1912 uint64_t *d = vd, *g = vg; 1913 intptr_t i = 0; 1914 1915 do { 1916 uint64_t this_d = d[i]; 1917 uint64_t this_g = g[i]; 1918 1919 if (this_g) { 1920 if (!(flags & 4)) { 1921 /* Set in D the first bit of G. 
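That is, OR in G & -G, which isolates the lowest active bit; e.g. for G = 0b101000, G & -G = 0b001000.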
*/ 1922 this_d |= this_g & -this_g; 1923 d[i] = this_d; 1924 } 1925 flags = iter_predtest_fwd(this_d, this_g, flags); 1926 } 1927 } while (++i < words); 1928 1929 return flags; 1930 } 1931 1932 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) 1933 { 1934 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1935 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 1936 uint32_t flags = PREDTEST_INIT; 1937 uint64_t *d = vd, *g = vg, esz_mask; 1938 intptr_t i, next; 1939 1940 next = last_active_element(vd, words, esz) + (1 << esz); 1941 esz_mask = pred_esz_masks[esz]; 1942 1943 /* Similar to the pseudocode for pnext, but scaled by ESZ 1944 so that we find the correct bit. */ 1945 if (next < words * 64) { 1946 uint64_t mask = -1; 1947 1948 if (next & 63) { 1949 mask = ~((1ull << (next & 63)) - 1); 1950 next &= -64; 1951 } 1952 do { 1953 uint64_t this_g = g[next / 64] & esz_mask & mask; 1954 if (this_g != 0) { 1955 next = (next & -64) + ctz64(this_g); 1956 break; 1957 } 1958 next += 64; 1959 mask = -1; 1960 } while (next < words * 64); 1961 } 1962 1963 i = 0; 1964 do { 1965 uint64_t this_d = 0; 1966 if (i == next / 64) { 1967 this_d = 1ull << (next & 63); 1968 } 1969 d[i] = this_d; 1970 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); 1971 } while (++i < words); 1972 1973 return flags; 1974 } 1975 1976 /* 1977 * Copy Zn into Zd, and store zero into inactive elements. 1978 * If inv, store zeros into the active elements. 1979 */ 1980 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) 1981 { 1982 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1983 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1984 uint64_t *d = vd, *n = vn; 1985 uint8_t *pg = vg; 1986 1987 for (i = 0; i < opr_sz; i += 1) { 1988 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); 1989 } 1990 } 1991 1992 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) 1993 { 1994 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1995 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1996 uint64_t *d = vd, *n = vn; 1997 uint8_t *pg = vg; 1998 1999 for (i = 0; i < opr_sz; i += 1) { 2000 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); 2001 } 2002 } 2003 2004 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) 2005 { 2006 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2007 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 2008 uint64_t *d = vd, *n = vn; 2009 uint8_t *pg = vg; 2010 2011 for (i = 0; i < opr_sz; i += 1) { 2012 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); 2013 } 2014 } 2015 2016 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) 2017 { 2018 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2019 uint64_t *d = vd, *n = vn; 2020 uint8_t *pg = vg; 2021 uint8_t inv = simd_data(desc); 2022 2023 for (i = 0; i < opr_sz; i += 1) { 2024 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); 2025 } 2026 } 2027 2028 /* Three-operand expander, immediate operand, controlled by a predicate. 2029 */ 2030 #define DO_ZPZI(NAME, TYPE, H, OP) \ 2031 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2032 { \ 2033 intptr_t i, opr_sz = simd_oprsz(desc); \ 2034 TYPE imm = simd_data(desc); \ 2035 for (i = 0; i < opr_sz; ) { \ 2036 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2037 do { \ 2038 if (pg & 1) { \ 2039 TYPE nn = *(TYPE *)(vn + H(i)); \ 2040 *(TYPE *)(vd + H(i)) = OP(nn, imm); \ 2041 } \ 2042 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2043 } while (i & 15); \ 2044 } \ 2045 } 2046 2047 /* Similarly, specialized for 64-bit operands. 
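Each predicate byte then governs exactly one element, so only bit 0 of pg[i] needs testing and no sub-word predicate shifting is required.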
*/ 2048 #define DO_ZPZI_D(NAME, TYPE, OP) \ 2049 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2050 { \ 2051 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2052 TYPE *d = vd, *n = vn; \ 2053 TYPE imm = simd_data(desc); \ 2054 uint8_t *pg = vg; \ 2055 for (i = 0; i < opr_sz; i += 1) { \ 2056 if (pg[H1(i)] & 1) { \ 2057 TYPE nn = n[i]; \ 2058 d[i] = OP(nn, imm); \ 2059 } \ 2060 } \ 2061 } 2062 2063 #define DO_SHR(N, M) (N >> M) 2064 #define DO_SHL(N, M) (N << M) 2065 2066 /* Arithmetic shift right for division. This rounds negative numbers 2067 toward zero as per signed division. Therefore before shifting, 2068 when N is negative, add 2**M-1. */ 2069 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) 2070 2071 static inline uint64_t do_urshr(uint64_t x, unsigned sh) 2072 { 2073 if (likely(sh < 64)) { 2074 return (x >> sh) + ((x >> (sh - 1)) & 1); 2075 } else if (sh == 64) { 2076 return x >> 63; 2077 } else { 2078 return 0; 2079 } 2080 } 2081 2082 static inline int64_t do_srshr(int64_t x, unsigned sh) 2083 { 2084 if (likely(sh < 64)) { 2085 return (x >> sh) + ((x >> (sh - 1)) & 1); 2086 } else { 2087 /* Rounding the sign bit always produces 0. */ 2088 return 0; 2089 } 2090 } 2091 2092 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) 2093 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) 2094 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) 2095 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) 2096 2097 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) 2098 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) 2099 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) 2100 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) 2101 2102 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) 2103 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) 2104 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) 2105 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) 2106 2107 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) 2108 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) 2109 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) 2110 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) 2111 2112 /* SVE2 bitwise shift by immediate */ 2113 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) 2114 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) 2115 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) 2116 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) 2117 2118 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) 2119 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) 2120 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) 2121 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) 2122 2123 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) 2124 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) 2125 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) 2126 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) 2127 2128 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) 2129 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) 2130 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) 2131 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) 2132 2133 #define do_suqrshl_b(n, m) \ 2134 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 2135 #define do_suqrshl_h(n, m) \ 2136 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 2137 #define do_suqrshl_s(n, m) \ 2138 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) 2139 #define do_suqrshl_d(n, m) \ 2140 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) 2141 2142 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) 2143 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) 2144 
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) 2145 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) 2146 2147 #undef DO_ASRD 2148 #undef DO_ZPZI 2149 #undef DO_ZPZI_D 2150 2151 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ 2152 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2153 { \ 2154 intptr_t i, opr_sz = simd_oprsz(desc); \ 2155 int shift = simd_data(desc); \ 2156 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2157 TYPEW nn = *(TYPEW *)(vn + i); \ 2158 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ 2159 } \ 2160 } 2161 2162 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 2163 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2164 { \ 2165 intptr_t i, opr_sz = simd_oprsz(desc); \ 2166 int shift = simd_data(desc); \ 2167 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2168 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2169 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ 2170 } \ 2171 } 2172 2173 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) 2174 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) 2175 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) 2176 2177 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) 2178 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) 2179 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) 2180 2181 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) 2182 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) 2183 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) 2184 2185 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 2186 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) 2187 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) 2188 2189 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) 2190 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) 2191 #define DO_SQSHRUN_D(x, sh) \ 2192 do_sat_bhs((int64_t)(x) >> (sh < 64 ? 
sh : 63), 0, UINT32_MAX) 2193 2194 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) 2195 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) 2196 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) 2197 2198 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) 2199 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) 2200 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) 2201 2202 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) 2203 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) 2204 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) 2205 2206 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) 2207 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) 2208 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) 2209 2210 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) 2211 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) 2212 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) 2213 2214 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) 2215 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) 2216 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) 2217 2218 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) 2219 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) 2220 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) 2221 2222 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) 2223 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) 2224 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) 2225 2226 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) 2227 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) 2228 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) 2229 2230 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) 2231 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) 2232 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) 2233 2234 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) 2235 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) 2236 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) 2237 2238 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) 2239 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) 2240 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) 2241 2242 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) 2243 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) 2244 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) 2245 2246 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) 2247 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) 2248 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) 2249 2250 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) 2251 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) 2252 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) 2253 2254 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) 2255 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) 2256 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) 2257 2258 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) 2259 DO_SHRNT(sve2_uqrshrnt_s, 
uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) 2260 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) 2261 2262 #undef DO_SHRNB 2263 #undef DO_SHRNT 2264 2265 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ 2266 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2267 { \ 2268 intptr_t i, opr_sz = simd_oprsz(desc); \ 2269 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2270 TYPEW nn = *(TYPEW *)(vn + i); \ 2271 TYPEW mm = *(TYPEW *)(vm + i); \ 2272 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ 2273 } \ 2274 } 2275 2276 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ 2277 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2278 { \ 2279 intptr_t i, opr_sz = simd_oprsz(desc); \ 2280 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2281 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2282 TYPEW mm = *(TYPEW *)(vm + HW(i)); \ 2283 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ 2284 } \ 2285 } 2286 2287 #define DO_ADDHN(N, M, SH) ((N + M) >> SH) 2288 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) 2289 #define DO_SUBHN(N, M, SH) ((N - M) >> SH) 2290 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) 2291 2292 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) 2293 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) 2294 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) 2295 2296 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) 2297 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) 2298 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) 2299 2300 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) 2301 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) 2302 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) 2303 2304 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) 2305 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) 2306 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) 2307 2308 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) 2309 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) 2310 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) 2311 2312 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) 2313 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) 2314 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) 2315 2316 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) 2317 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) 2318 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) 2319 2320 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) 2321 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) 2322 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) 2323 2324 #undef DO_RSUBHN 2325 #undef DO_SUBHN 2326 #undef DO_RADDHN 2327 #undef DO_ADDHN 2328 2329 #undef DO_BINOPNB 2330 2331 /* Fully general four-operand expander, controlled by a predicate. 
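 * For example, DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) below expands to a
 * helper computing d[i] = a[i] + n[i] * m[i] for each active byte element.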
2332 */ 2333 #define DO_ZPZZZ(NAME, TYPE, H, OP) \ 2334 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2335 void *vg, uint32_t desc) \ 2336 { \ 2337 intptr_t i, opr_sz = simd_oprsz(desc); \ 2338 for (i = 0; i < opr_sz; ) { \ 2339 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2340 do { \ 2341 if (pg & 1) { \ 2342 TYPE nn = *(TYPE *)(vn + H(i)); \ 2343 TYPE mm = *(TYPE *)(vm + H(i)); \ 2344 TYPE aa = *(TYPE *)(va + H(i)); \ 2345 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ 2346 } \ 2347 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2348 } while (i & 15); \ 2349 } \ 2350 } 2351 2352 /* Similarly, specialized for 64-bit operands. */ 2353 #define DO_ZPZZZ_D(NAME, TYPE, OP) \ 2354 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2355 void *vg, uint32_t desc) \ 2356 { \ 2357 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2358 TYPE *d = vd, *a = va, *n = vn, *m = vm; \ 2359 uint8_t *pg = vg; \ 2360 for (i = 0; i < opr_sz; i += 1) { \ 2361 if (pg[H1(i)] & 1) { \ 2362 TYPE aa = a[i], nn = n[i], mm = m[i]; \ 2363 d[i] = OP(aa, nn, mm); \ 2364 } \ 2365 } \ 2366 } 2367 2368 #define DO_MLA(A, N, M) (A + N * M) 2369 #define DO_MLS(A, N, M) (A - N * M) 2370 2371 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) 2372 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) 2373 2374 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) 2375 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) 2376 2377 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) 2378 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) 2379 2380 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) 2381 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) 2382 2383 #undef DO_MLA 2384 #undef DO_MLS 2385 #undef DO_ZPZZZ 2386 #undef DO_ZPZZZ_D 2387 2388 void HELPER(sve_index_b)(void *vd, uint32_t start, 2389 uint32_t incr, uint32_t desc) 2390 { 2391 intptr_t i, opr_sz = simd_oprsz(desc); 2392 uint8_t *d = vd; 2393 for (i = 0; i < opr_sz; i += 1) { 2394 d[H1(i)] = start + i * incr; 2395 } 2396 } 2397 2398 void HELPER(sve_index_h)(void *vd, uint32_t start, 2399 uint32_t incr, uint32_t desc) 2400 { 2401 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2402 uint16_t *d = vd; 2403 for (i = 0; i < opr_sz; i += 1) { 2404 d[H2(i)] = start + i * incr; 2405 } 2406 } 2407 2408 void HELPER(sve_index_s)(void *vd, uint32_t start, 2409 uint32_t incr, uint32_t desc) 2410 { 2411 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2412 uint32_t *d = vd; 2413 for (i = 0; i < opr_sz; i += 1) { 2414 d[H4(i)] = start + i * incr; 2415 } 2416 } 2417 2418 void HELPER(sve_index_d)(void *vd, uint64_t start, 2419 uint64_t incr, uint32_t desc) 2420 { 2421 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2422 uint64_t *d = vd; 2423 for (i = 0; i < opr_sz; i += 1) { 2424 d[i] = start + i * incr; 2425 } 2426 } 2427 2428 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) 2429 { 2430 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2431 uint32_t sh = simd_data(desc); 2432 uint32_t *d = vd, *n = vn, *m = vm; 2433 for (i = 0; i < opr_sz; i += 1) { 2434 d[i] = n[i] + (m[i] << sh); 2435 } 2436 } 2437 2438 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) 2439 { 2440 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2441 uint64_t sh = simd_data(desc); 2442 uint64_t *d = vd, *n = vn, *m = vm; 2443 for (i = 0; i < opr_sz; i += 1) { 2444 d[i] = n[i] + (m[i] << sh); 2445 } 2446 } 2447 2448 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) 2449 { 2450 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2451 uint64_t sh = simd_data(desc); 2452 uint64_t *d = vd, *n = vn, *m = vm; 2453 for (i = 0; i < opr_sz; i += 1) { 2454 d[i] = 
n[i] + ((uint64_t)(int32_t)m[i] << sh); 2455 } 2456 } 2457 2458 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) 2459 { 2460 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2461 uint64_t sh = simd_data(desc); 2462 uint64_t *d = vd, *n = vn, *m = vm; 2463 for (i = 0; i < opr_sz; i += 1) { 2464 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); 2465 } 2466 } 2467 2468 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) 2469 { 2470 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2471 static const uint16_t coeff[] = { 2472 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, 2473 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, 2474 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, 2475 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, 2476 }; 2477 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2478 uint16_t *d = vd, *n = vn; 2479 2480 for (i = 0; i < opr_sz; i++) { 2481 uint16_t nn = n[i]; 2482 intptr_t idx = extract32(nn, 0, 5); 2483 uint16_t exp = extract32(nn, 5, 5); 2484 d[i] = coeff[idx] | (exp << 10); 2485 } 2486 } 2487 2488 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) 2489 { 2490 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2491 static const uint32_t coeff[] = { 2492 0x000000, 0x0164d2, 0x02cd87, 0x043a29, 2493 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, 2494 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, 2495 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, 2496 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, 2497 0x1ef532, 0x20b051, 0x227043, 0x243516, 2498 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, 2499 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, 2500 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, 2501 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, 2502 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, 2503 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, 2504 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, 2505 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, 2506 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, 2507 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, 2508 }; 2509 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2510 uint32_t *d = vd, *n = vn; 2511 2512 for (i = 0; i < opr_sz; i++) { 2513 uint32_t nn = n[i]; 2514 intptr_t idx = extract32(nn, 0, 6); 2515 uint32_t exp = extract32(nn, 6, 8); 2516 d[i] = coeff[idx] | (exp << 23); 2517 } 2518 } 2519 2520 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) 2521 { 2522 /* These constants are cut-and-paste directly from the ARM pseudocode. 
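Each entry holds the 52-bit fraction of 2**(i/64); the exponent field extracted from the source element is then placed above it at bit 52.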
*/ 2523 static const uint64_t coeff[] = { 2524 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, 2525 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, 2526 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, 2527 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, 2528 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, 2529 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, 2530 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, 2531 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, 2532 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, 2533 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, 2534 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, 2535 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, 2536 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, 2537 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, 2538 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, 2539 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, 2540 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, 2541 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, 2542 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, 2543 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, 2544 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, 2545 0xFA7C1819E90D8ull, 2546 }; 2547 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2548 uint64_t *d = vd, *n = vn; 2549 2550 for (i = 0; i < opr_sz; i++) { 2551 uint64_t nn = n[i]; 2552 intptr_t idx = extract32(nn, 0, 6); 2553 uint64_t exp = extract32(nn, 6, 11); 2554 d[i] = coeff[idx] | (exp << 52); 2555 } 2556 } 2557 2558 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) 2559 { 2560 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2561 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2562 uint16_t *d = vd, *n = vn, *m = vm; 2563 for (i = 0; i < opr_sz; i += 1) { 2564 uint16_t nn = n[i]; 2565 uint16_t mm = m[i]; 2566 if (mm & 1) { 2567 nn = float16_one; 2568 } 2569 if (mm & 2) { 2570 nn = float16_maybe_ah_chs(nn, fpcr_ah); 2571 } 2572 d[i] = nn; 2573 } 2574 } 2575 2576 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) 2577 { 2578 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2579 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2580 uint32_t *d = vd, *n = vn, *m = vm; 2581 for (i = 0; i < opr_sz; i += 1) { 2582 uint32_t nn = n[i]; 2583 uint32_t mm = m[i]; 2584 if (mm & 1) { 2585 nn = float32_one; 2586 } 2587 if (mm & 2) { 2588 nn = float32_maybe_ah_chs(nn, fpcr_ah); 2589 } 2590 d[i] = nn; 2591 } 2592 } 2593 2594 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) 2595 { 2596 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2597 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2598 uint64_t *d = vd, *n = vn, *m = vm; 2599 for (i = 0; i < opr_sz; i += 1) { 2600 uint64_t nn = n[i]; 2601 uint64_t mm = m[i]; 2602 if (mm & 1) { 2603 nn = float64_one; 2604 } 2605 if (mm & 2) { 2606 nn = float64_maybe_ah_chs(nn, fpcr_ah); 2607 } 2608 d[i] = nn; 2609 } 2610 } 2611 2612 /* 2613 * Signed saturating addition with scalar operand. 
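 * B is signed, so a negative B gives a saturating subtraction.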
2614 */ 2615 2616 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2617 { 2618 intptr_t i, oprsz = simd_oprsz(desc); 2619 2620 for (i = 0; i < oprsz; i += sizeof(int8_t)) { 2621 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); 2622 } 2623 } 2624 2625 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2626 { 2627 intptr_t i, oprsz = simd_oprsz(desc); 2628 2629 for (i = 0; i < oprsz; i += sizeof(int16_t)) { 2630 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); 2631 } 2632 } 2633 2634 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2635 { 2636 intptr_t i, oprsz = simd_oprsz(desc); 2637 2638 for (i = 0; i < oprsz; i += sizeof(int32_t)) { 2639 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); 2640 } 2641 } 2642 2643 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) 2644 { 2645 intptr_t i, oprsz = simd_oprsz(desc); 2646 2647 for (i = 0; i < oprsz; i += sizeof(int64_t)) { 2648 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); 2649 } 2650 } 2651 2652 /* 2653 * Unsigned saturating addition with scalar operand. 2654 */ 2655 2656 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2657 { 2658 intptr_t i, oprsz = simd_oprsz(desc); 2659 2660 for (i = 0; i < oprsz; i += sizeof(uint8_t)) { 2661 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); 2662 } 2663 } 2664 2665 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2666 { 2667 intptr_t i, oprsz = simd_oprsz(desc); 2668 2669 for (i = 0; i < oprsz; i += sizeof(uint16_t)) { 2670 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); 2671 } 2672 } 2673 2674 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2675 { 2676 intptr_t i, oprsz = simd_oprsz(desc); 2677 2678 for (i = 0; i < oprsz; i += sizeof(uint32_t)) { 2679 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); 2680 } 2681 } 2682 2683 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2684 { 2685 intptr_t i, oprsz = simd_oprsz(desc); 2686 2687 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2688 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); 2689 } 2690 } 2691 2692 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2693 { 2694 intptr_t i, oprsz = simd_oprsz(desc); 2695 2696 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2697 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); 2698 } 2699 } 2700 2701 /* Two operand predicated copy immediate with merge. All valid immediates 2702 * can fit within 17 signed bits in the simd_data field. 
2703 */ 2704 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, 2705 uint64_t mm, uint32_t desc) 2706 { 2707 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2708 uint64_t *d = vd, *n = vn; 2709 uint8_t *pg = vg; 2710 2711 mm = dup_const(MO_8, mm); 2712 for (i = 0; i < opr_sz; i += 1) { 2713 uint64_t nn = n[i]; 2714 uint64_t pp = expand_pred_b(pg[H1(i)]); 2715 d[i] = (mm & pp) | (nn & ~pp); 2716 } 2717 } 2718 2719 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, 2720 uint64_t mm, uint32_t desc) 2721 { 2722 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2723 uint64_t *d = vd, *n = vn; 2724 uint8_t *pg = vg; 2725 2726 mm = dup_const(MO_16, mm); 2727 for (i = 0; i < opr_sz; i += 1) { 2728 uint64_t nn = n[i]; 2729 uint64_t pp = expand_pred_h(pg[H1(i)]); 2730 d[i] = (mm & pp) | (nn & ~pp); 2731 } 2732 } 2733 2734 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, 2735 uint64_t mm, uint32_t desc) 2736 { 2737 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2738 uint64_t *d = vd, *n = vn; 2739 uint8_t *pg = vg; 2740 2741 mm = dup_const(MO_32, mm); 2742 for (i = 0; i < opr_sz; i += 1) { 2743 uint64_t nn = n[i]; 2744 uint64_t pp = expand_pred_s(pg[H1(i)]); 2745 d[i] = (mm & pp) | (nn & ~pp); 2746 } 2747 } 2748 2749 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, 2750 uint64_t mm, uint32_t desc) 2751 { 2752 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2753 uint64_t *d = vd, *n = vn; 2754 uint8_t *pg = vg; 2755 2756 for (i = 0; i < opr_sz; i += 1) { 2757 uint64_t nn = n[i]; 2758 d[i] = (pg[H1(i)] & 1 ? mm : nn); 2759 } 2760 } 2761 2762 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) 2763 { 2764 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2765 uint64_t *d = vd; 2766 uint8_t *pg = vg; 2767 2768 val = dup_const(MO_8, val); 2769 for (i = 0; i < opr_sz; i += 1) { 2770 d[i] = val & expand_pred_b(pg[H1(i)]); 2771 } 2772 } 2773 2774 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) 2775 { 2776 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2777 uint64_t *d = vd; 2778 uint8_t *pg = vg; 2779 2780 val = dup_const(MO_16, val); 2781 for (i = 0; i < opr_sz; i += 1) { 2782 d[i] = val & expand_pred_h(pg[H1(i)]); 2783 } 2784 } 2785 2786 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) 2787 { 2788 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2789 uint64_t *d = vd; 2790 uint8_t *pg = vg; 2791 2792 val = dup_const(MO_32, val); 2793 for (i = 0; i < opr_sz; i += 1) { 2794 d[i] = val & expand_pred_s(pg[H1(i)]); 2795 } 2796 } 2797 2798 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) 2799 { 2800 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2801 uint64_t *d = vd; 2802 uint8_t *pg = vg; 2803 2804 for (i = 0; i < opr_sz; i += 1) { 2805 d[i] = (pg[H1(i)] & 1 ? val : 0); 2806 } 2807 } 2808 2809 /* Big-endian hosts need to frob the byte indices. If the copy 2810 * happens to be 8-byte aligned, then no frobbing necessary. 
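 * Otherwise, copy in the largest unit compatible with the misalignment
 * (4, 2, or 1 bytes), using the matching H macro to preserve element order.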
2811 */ 2812 static void swap_memmove(void *vd, void *vs, size_t n) 2813 { 2814 uintptr_t d = (uintptr_t)vd; 2815 uintptr_t s = (uintptr_t)vs; 2816 uintptr_t o = (d | s | n) & 7; 2817 size_t i; 2818 2819 #if !HOST_BIG_ENDIAN 2820 o = 0; 2821 #endif 2822 switch (o) { 2823 case 0: 2824 memmove(vd, vs, n); 2825 break; 2826 2827 case 4: 2828 if (d < s || d >= s + n) { 2829 for (i = 0; i < n; i += 4) { 2830 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2831 } 2832 } else { 2833 for (i = n; i > 0; ) { 2834 i -= 4; 2835 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2836 } 2837 } 2838 break; 2839 2840 case 2: 2841 case 6: 2842 if (d < s || d >= s + n) { 2843 for (i = 0; i < n; i += 2) { 2844 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2845 } 2846 } else { 2847 for (i = n; i > 0; ) { 2848 i -= 2; 2849 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2850 } 2851 } 2852 break; 2853 2854 default: 2855 if (d < s || d >= s + n) { 2856 for (i = 0; i < n; i++) { 2857 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2858 } 2859 } else { 2860 for (i = n; i > 0; ) { 2861 i -= 1; 2862 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2863 } 2864 } 2865 break; 2866 } 2867 } 2868 2869 /* Similarly for memset of 0. */ 2870 static void swap_memzero(void *vd, size_t n) 2871 { 2872 uintptr_t d = (uintptr_t)vd; 2873 uintptr_t o = (d | n) & 7; 2874 size_t i; 2875 2876 /* Usually, the first bit of a predicate is set, so N is 0. */ 2877 if (likely(n == 0)) { 2878 return; 2879 } 2880 2881 #if !HOST_BIG_ENDIAN 2882 o = 0; 2883 #endif 2884 switch (o) { 2885 case 0: 2886 memset(vd, 0, n); 2887 break; 2888 2889 case 4: 2890 for (i = 0; i < n; i += 4) { 2891 *(uint32_t *)H1_4(d + i) = 0; 2892 } 2893 break; 2894 2895 case 2: 2896 case 6: 2897 for (i = 0; i < n; i += 2) { 2898 *(uint16_t *)H1_2(d + i) = 0; 2899 } 2900 break; 2901 2902 default: 2903 for (i = 0; i < n; i++) { 2904 *(uint8_t *)H1(d + i) = 0; 2905 } 2906 break; 2907 } 2908 } 2909 2910 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) 2911 { 2912 intptr_t opr_sz = simd_oprsz(desc); 2913 size_t n_ofs = simd_data(desc); 2914 size_t n_siz = opr_sz - n_ofs; 2915 2916 if (vd != vm) { 2917 swap_memmove(vd, vn + n_ofs, n_siz); 2918 swap_memmove(vd + n_siz, vm, n_ofs); 2919 } else if (vd != vn) { 2920 swap_memmove(vd + n_siz, vd, n_ofs); 2921 swap_memmove(vd, vn + n_ofs, n_siz); 2922 } else { 2923 /* vd == vn == vm. Need temp space. 
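Save the low n_ofs bytes of vm first, since moving vd down in place would overwrite them.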
*/ 2924 ARMVectorReg tmp; 2925 swap_memmove(&tmp, vm, n_ofs); 2926 swap_memmove(vd, vd + n_ofs, n_siz); 2927 memcpy(vd + n_siz, &tmp, n_ofs); 2928 } 2929 } 2930 2931 #define DO_INSR(NAME, TYPE, H) \ 2932 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ 2933 { \ 2934 intptr_t opr_sz = simd_oprsz(desc); \ 2935 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ 2936 *(TYPE *)(vd + H(0)) = val; \ 2937 } 2938 2939 DO_INSR(sve_insr_b, uint8_t, H1) 2940 DO_INSR(sve_insr_h, uint16_t, H1_2) 2941 DO_INSR(sve_insr_s, uint32_t, H1_4) 2942 DO_INSR(sve_insr_d, uint64_t, H1_8) 2943 2944 #undef DO_INSR 2945 2946 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) 2947 { 2948 intptr_t i, j, opr_sz = simd_oprsz(desc); 2949 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2950 uint64_t f = *(uint64_t *)(vn + i); 2951 uint64_t b = *(uint64_t *)(vn + j); 2952 *(uint64_t *)(vd + i) = bswap64(b); 2953 *(uint64_t *)(vd + j) = bswap64(f); 2954 } 2955 } 2956 2957 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) 2958 { 2959 intptr_t i, j, opr_sz = simd_oprsz(desc); 2960 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2961 uint64_t f = *(uint64_t *)(vn + i); 2962 uint64_t b = *(uint64_t *)(vn + j); 2963 *(uint64_t *)(vd + i) = hswap64(b); 2964 *(uint64_t *)(vd + j) = hswap64(f); 2965 } 2966 } 2967 2968 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) 2969 { 2970 intptr_t i, j, opr_sz = simd_oprsz(desc); 2971 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2972 uint64_t f = *(uint64_t *)(vn + i); 2973 uint64_t b = *(uint64_t *)(vn + j); 2974 *(uint64_t *)(vd + i) = rol64(b, 32); 2975 *(uint64_t *)(vd + j) = rol64(f, 32); 2976 } 2977 } 2978 2979 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) 2980 { 2981 intptr_t i, j, opr_sz = simd_oprsz(desc); 2982 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2983 uint64_t f = *(uint64_t *)(vn + i); 2984 uint64_t b = *(uint64_t *)(vn + j); 2985 *(uint64_t *)(vd + i) = b; 2986 *(uint64_t *)(vd + j) = f; 2987 } 2988 } 2989 2990 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); 2991 2992 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, 2993 bool is_tbx, tb_impl_fn *fn) 2994 { 2995 ARMVectorReg scratch; 2996 uintptr_t oprsz = simd_oprsz(desc); 2997 2998 if (unlikely(vd == vn)) { 2999 vn = memcpy(&scratch, vn, oprsz); 3000 } 3001 3002 fn(vd, vn, NULL, vm, oprsz, is_tbx); 3003 } 3004 3005 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, 3006 uint32_t desc, bool is_tbx, tb_impl_fn *fn) 3007 { 3008 ARMVectorReg scratch; 3009 uintptr_t oprsz = simd_oprsz(desc); 3010 3011 if (unlikely(vd == vn0)) { 3012 vn0 = memcpy(&scratch, vn0, oprsz); 3013 if (vd == vn1) { 3014 vn1 = vn0; 3015 } 3016 } else if (unlikely(vd == vn1)) { 3017 vn1 = memcpy(&scratch, vn1, oprsz); 3018 } 3019 3020 fn(vd, vn0, vn1, vm, oprsz, is_tbx); 3021 } 3022 3023 #define DO_TB(SUFF, TYPE, H) \ 3024 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ 3025 void *vm, uintptr_t oprsz, bool is_tbx) \ 3026 { \ 3027 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ 3028 uintptr_t i, nelem = oprsz / sizeof(TYPE); \ 3029 for (i = 0; i < nelem; ++i) { \ 3030 TYPE index = indexes[H1(i)], val = 0; \ 3031 if (index < nelem) { \ 3032 val = tbl0[H(index)]; \ 3033 } else { \ 3034 index -= nelem; \ 3035 if (tbl1 && index < nelem) { \ 3036 val = tbl1[H(index)]; \ 3037 } else if (is_tbx) { \ 3038 continue; \ 3039 } \ 3040 } \ 3041 
d[H(i)] = val; \ 3042 } \ 3043 } \ 3044 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3045 { \ 3046 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ 3047 } \ 3048 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ 3049 void *vm, uint32_t desc) \ 3050 { \ 3051 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ 3052 } \ 3053 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3054 { \ 3055 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ 3056 } 3057 3058 DO_TB(b, uint8_t, H1) 3059 DO_TB(h, uint16_t, H2) 3060 DO_TB(s, uint32_t, H4) 3061 DO_TB(d, uint64_t, H8) 3062 3063 #undef DO_TB 3064 3065 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ 3066 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 3067 { \ 3068 intptr_t i, opr_sz = simd_oprsz(desc); \ 3069 TYPED *d = vd; \ 3070 TYPES *n = vn; \ 3071 ARMVectorReg tmp; \ 3072 if (unlikely(vn - vd < opr_sz)) { \ 3073 n = memcpy(&tmp, n, opr_sz / 2); \ 3074 } \ 3075 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ 3076 d[HD(i)] = n[HS(i)]; \ 3077 } \ 3078 } 3079 3080 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) 3081 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) 3082 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) 3083 3084 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) 3085 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) 3086 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) 3087 3088 #undef DO_UNPK 3089 3090 /* Mask of bits included in the even numbered predicates of width esz. 3091 * We also use this for expand_bits/compress_bits, and so extend the 3092 * same pattern out to 16-bit units. 3093 */ 3094 static const uint64_t even_bit_esz_masks[5] = { 3095 0x5555555555555555ull, 3096 0x3333333333333333ull, 3097 0x0f0f0f0f0f0f0f0full, 3098 0x00ff00ff00ff00ffull, 3099 0x0000ffff0000ffffull, 3100 }; 3101 3102 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits. 3103 * For N==0, this corresponds to the operation that in qemu/bitops.h 3104 * we call half_shuffle64; this algorithm is from Hacker's Delight, 3105 * section 7-2 Shuffling Bits. 3106 */ 3107 static uint64_t expand_bits(uint64_t x, int n) 3108 { 3109 int i; 3110 3111 x &= 0xffffffffu; 3112 for (i = 4; i >= n; i--) { 3113 int sh = 1 << i; 3114 x = ((x << sh) | x) & even_bit_esz_masks[i]; 3115 } 3116 return x; 3117 } 3118 3119 /* Compress units of 2**(N+1) bits to units of 2**N bits. 3120 * For N==0, this corresponds to the operation that in qemu/bitops.h 3121 * we call half_unshuffle64; this algorithm is from Hacker's Delight, 3122 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. 
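 * E.g. compress_bits(0x44, 0) == 0x0a, the inverse of expand_bits(0x0a, 0) == 0x44.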
3123 */ 3124 static uint64_t compress_bits(uint64_t x, int n) 3125 { 3126 int i; 3127 3128 for (i = n; i <= 4; i++) { 3129 int sh = 1 << i; 3130 x &= even_bit_esz_masks[i]; 3131 x = (x >> sh) | x; 3132 } 3133 return x & 0xffffffffu; 3134 } 3135 3136 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3137 { 3138 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3139 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3140 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3141 int esize = 1 << esz; 3142 uint64_t *d = vd; 3143 intptr_t i; 3144 3145 if (oprsz <= 8) { 3146 uint64_t nn = *(uint64_t *)vn; 3147 uint64_t mm = *(uint64_t *)vm; 3148 int half = 4 * oprsz; 3149 3150 nn = extract64(nn, high * half, half); 3151 mm = extract64(mm, high * half, half); 3152 nn = expand_bits(nn, esz); 3153 mm = expand_bits(mm, esz); 3154 d[0] = nn | (mm << esize); 3155 } else { 3156 ARMPredicateReg tmp; 3157 3158 /* We produce output faster than we consume input. 3159 Therefore we must be mindful of possible overlap. */ 3160 if (vd == vn) { 3161 vn = memcpy(&tmp, vn, oprsz); 3162 if (vd == vm) { 3163 vm = vn; 3164 } 3165 } else if (vd == vm) { 3166 vm = memcpy(&tmp, vm, oprsz); 3167 } 3168 if (high) { 3169 high = oprsz >> 1; 3170 } 3171 3172 if ((oprsz & 7) == 0) { 3173 uint32_t *n = vn, *m = vm; 3174 high >>= 2; 3175 3176 for (i = 0; i < oprsz / 8; i++) { 3177 uint64_t nn = n[H4(high + i)]; 3178 uint64_t mm = m[H4(high + i)]; 3179 3180 nn = expand_bits(nn, esz); 3181 mm = expand_bits(mm, esz); 3182 d[i] = nn | (mm << esize); 3183 } 3184 } else { 3185 uint8_t *n = vn, *m = vm; 3186 uint16_t *d16 = vd; 3187 3188 for (i = 0; i < oprsz / 2; i++) { 3189 uint16_t nn = n[H1(high + i)]; 3190 uint16_t mm = m[H1(high + i)]; 3191 3192 nn = expand_bits(nn, esz); 3193 mm = expand_bits(mm, esz); 3194 d16[H2(i)] = nn | (mm << esize); 3195 } 3196 } 3197 } 3198 } 3199 3200 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3201 { 3202 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3203 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3204 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; 3205 uint64_t *d = vd, *n = vn, *m = vm; 3206 uint64_t l, h; 3207 intptr_t i; 3208 3209 if (oprsz <= 8) { 3210 l = compress_bits(n[0] >> odd, esz); 3211 h = compress_bits(m[0] >> odd, esz); 3212 d[0] = l | (h << (4 * oprsz)); 3213 } else { 3214 ARMPredicateReg tmp_m; 3215 intptr_t oprsz_16 = oprsz / 16; 3216 3217 if ((vm - vd) < (uintptr_t)oprsz) { 3218 m = memcpy(&tmp_m, vm, oprsz); 3219 } 3220 3221 for (i = 0; i < oprsz_16; i++) { 3222 l = n[2 * i + 0]; 3223 h = n[2 * i + 1]; 3224 l = compress_bits(l >> odd, esz); 3225 h = compress_bits(h >> odd, esz); 3226 d[i] = l | (h << 32); 3227 } 3228 3229 /* 3230 * For VL which is not a multiple of 512, the results from M do not 3231 * align nicely with the uint64_t for D. Put the aligned results 3232 * from M into TMP_M and then copy it into place afterward. 
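 * The final copy uses swap_memmove so that byte order is preserved on
 * big-endian hosts.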
3233 */ 3234 if (oprsz & 15) { 3235 int final_shift = (oprsz & 15) * 2; 3236 3237 l = n[2 * i + 0]; 3238 h = n[2 * i + 1]; 3239 l = compress_bits(l >> odd, esz); 3240 h = compress_bits(h >> odd, esz); 3241 d[i] = l | (h << final_shift); 3242 3243 for (i = 0; i < oprsz_16; i++) { 3244 l = m[2 * i + 0]; 3245 h = m[2 * i + 1]; 3246 l = compress_bits(l >> odd, esz); 3247 h = compress_bits(h >> odd, esz); 3248 tmp_m.p[i] = l | (h << 32); 3249 } 3250 l = m[2 * i + 0]; 3251 h = m[2 * i + 1]; 3252 l = compress_bits(l >> odd, esz); 3253 h = compress_bits(h >> odd, esz); 3254 tmp_m.p[i] = l | (h << final_shift); 3255 3256 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); 3257 } else { 3258 for (i = 0; i < oprsz_16; i++) { 3259 l = m[2 * i + 0]; 3260 h = m[2 * i + 1]; 3261 l = compress_bits(l >> odd, esz); 3262 h = compress_bits(h >> odd, esz); 3263 d[oprsz_16 + i] = l | (h << 32); 3264 } 3265 } 3266 } 3267 } 3268 3269 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3270 { 3271 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3272 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3273 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); 3274 uint64_t *d = vd, *n = vn, *m = vm; 3275 uint64_t mask; 3276 int shr, shl; 3277 intptr_t i; 3278 3279 shl = 1 << esz; 3280 shr = 0; 3281 mask = even_bit_esz_masks[esz]; 3282 if (odd) { 3283 mask <<= shl; 3284 shr = shl; 3285 shl = 0; 3286 } 3287 3288 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { 3289 uint64_t nn = (n[i] & mask) >> shr; 3290 uint64_t mm = (m[i] & mask) << shl; 3291 d[i] = nn + mm; 3292 } 3293 } 3294 3295 /* Reverse units of 2**N bits. */ 3296 static uint64_t reverse_bits_64(uint64_t x, int n) 3297 { 3298 int i, sh; 3299 3300 x = bswap64(x); 3301 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3302 uint64_t mask = even_bit_esz_masks[i]; 3303 x = ((x & mask) << sh) | ((x >> sh) & mask); 3304 } 3305 return x; 3306 } 3307 3308 static uint8_t reverse_bits_8(uint8_t x, int n) 3309 { 3310 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; 3311 int i, sh; 3312 3313 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3314 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); 3315 } 3316 return x; 3317 } 3318 3319 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) 3320 { 3321 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3322 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3323 intptr_t i, oprsz_2 = oprsz / 2; 3324 3325 if (oprsz <= 8) { 3326 uint64_t l = *(uint64_t *)vn; 3327 l = reverse_bits_64(l << (64 - 8 * oprsz), esz); 3328 *(uint64_t *)vd = l; 3329 } else if ((oprsz & 15) == 0) { 3330 for (i = 0; i < oprsz_2; i += 8) { 3331 intptr_t ih = oprsz - 8 - i; 3332 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); 3333 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); 3334 *(uint64_t *)(vd + i) = h; 3335 *(uint64_t *)(vd + ih) = l; 3336 } 3337 } else { 3338 for (i = 0; i < oprsz_2; i += 1) { 3339 intptr_t il = H1(i); 3340 intptr_t ih = H1(oprsz - 1 - i); 3341 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); 3342 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); 3343 *(uint8_t *)(vd + il) = h; 3344 *(uint8_t *)(vd + ih) = l; 3345 } 3346 } 3347 } 3348 3349 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) 3350 { 3351 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3352 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3353 uint64_t *d = vd; 3354 intptr_t i; 3355 3356 if (oprsz <= 8) { 3357 uint64_t nn = *(uint64_t *)vn; 3358 int half = 4 * oprsz; 3359 3360 nn = 
extract64(nn, high * half, half); 3361 nn = expand_bits(nn, 0); 3362 d[0] = nn; 3363 } else { 3364 ARMPredicateReg tmp_n; 3365 3366 /* We produce output faster than we consume input. 3367 Therefore we must be mindful of possible overlap. */ 3368 if ((vn - vd) < (uintptr_t)oprsz) { 3369 vn = memcpy(&tmp_n, vn, oprsz); 3370 } 3371 if (high) { 3372 high = oprsz >> 1; 3373 } 3374 3375 if ((oprsz & 7) == 0) { 3376 uint32_t *n = vn; 3377 high >>= 2; 3378 3379 for (i = 0; i < oprsz / 8; i++) { 3380 uint64_t nn = n[H4(high + i)]; 3381 d[i] = expand_bits(nn, 0); 3382 } 3383 } else { 3384 uint16_t *d16 = vd; 3385 uint8_t *n = vn; 3386 3387 for (i = 0; i < oprsz / 2; i++) { 3388 uint16_t nn = n[H1(high + i)]; 3389 d16[H2(i)] = expand_bits(nn, 0); 3390 } 3391 } 3392 } 3393 } 3394 3395 #define DO_ZIP(NAME, TYPE, H) \ 3396 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3397 { \ 3398 intptr_t oprsz = simd_oprsz(desc); \ 3399 intptr_t odd_ofs = simd_data(desc); \ 3400 intptr_t i, oprsz_2 = oprsz / 2; \ 3401 ARMVectorReg tmp_n, tmp_m; \ 3402 /* We produce output faster than we consume input. \ 3403 Therefore we must be mindful of possible overlap. */ \ 3404 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ 3405 vn = memcpy(&tmp_n, vn, oprsz); \ 3406 } \ 3407 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3408 vm = memcpy(&tmp_m, vm, oprsz); \ 3409 } \ 3410 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ 3411 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ 3412 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ 3413 *(TYPE *)(vm + odd_ofs + H(i)); \ 3414 } \ 3415 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3416 memset(vd + oprsz - 16, 0, 16); \ 3417 } \ 3418 } 3419 3420 DO_ZIP(sve_zip_b, uint8_t, H1) 3421 DO_ZIP(sve_zip_h, uint16_t, H1_2) 3422 DO_ZIP(sve_zip_s, uint32_t, H1_4) 3423 DO_ZIP(sve_zip_d, uint64_t, H1_8) 3424 DO_ZIP(sve2_zip_q, Int128, ) 3425 3426 #define DO_UZP(NAME, TYPE, H) \ 3427 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3428 { \ 3429 intptr_t oprsz = simd_oprsz(desc); \ 3430 intptr_t odd_ofs = simd_data(desc); \ 3431 intptr_t i, p; \ 3432 ARMVectorReg tmp_m; \ 3433 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3434 vm = memcpy(&tmp_m, vm, oprsz); \ 3435 } \ 3436 i = 0, p = odd_ofs; \ 3437 do { \ 3438 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ 3439 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3440 } while (p < oprsz); \ 3441 p -= oprsz; \ 3442 do { \ 3443 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ 3444 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3445 } while (p < oprsz); \ 3446 tcg_debug_assert(i == oprsz); \ 3447 } 3448 3449 DO_UZP(sve_uzp_b, uint8_t, H1) 3450 DO_UZP(sve_uzp_h, uint16_t, H1_2) 3451 DO_UZP(sve_uzp_s, uint32_t, H1_4) 3452 DO_UZP(sve_uzp_d, uint64_t, H1_8) 3453 DO_UZP(sve2_uzp_q, Int128, ) 3454 3455 #define DO_TRN(NAME, TYPE, H) \ 3456 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3457 { \ 3458 intptr_t oprsz = simd_oprsz(desc); \ 3459 intptr_t odd_ofs = simd_data(desc); \ 3460 intptr_t i; \ 3461 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ 3462 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ 3463 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ 3464 *(TYPE *)(vd + H(i + 0)) = ae; \ 3465 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ 3466 } \ 3467 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3468 memset(vd + oprsz - 16, 0, 16); \ 3469 } \ 3470 } 3471 3472 DO_TRN(sve_trn_b, uint8_t, H1) 3473 DO_TRN(sve_trn_h, uint16_t, H1_2) 3474 DO_TRN(sve_trn_s, uint32_t, H1_4) 3475 DO_TRN(sve_trn_d, 
uint64_t, H1_8) 3476 DO_TRN(sve2_trn_q, Int128, ) 3477 3478 #undef DO_ZIP 3479 #undef DO_UZP 3480 #undef DO_TRN 3481 3482 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) 3483 { 3484 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; 3485 uint32_t *d = vd, *n = vn; 3486 uint8_t *pg = vg; 3487 3488 for (i = j = 0; i < opr_sz; i++) { 3489 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { 3490 d[H4(j)] = n[H4(i)]; 3491 j++; 3492 } 3493 } 3494 for (; j < opr_sz; j++) { 3495 d[H4(j)] = 0; 3496 } 3497 } 3498 3499 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) 3500 { 3501 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; 3502 uint64_t *d = vd, *n = vn; 3503 uint8_t *pg = vg; 3504 3505 for (i = j = 0; i < opr_sz; i++) { 3506 if (pg[H1(i)] & 1) { 3507 d[j] = n[i]; 3508 j++; 3509 } 3510 } 3511 for (; j < opr_sz; j++) { 3512 d[j] = 0; 3513 } 3514 } 3515 3516 /* Similar to the ARM LastActiveElement pseudocode function, except the 3517 * result is multiplied by the element size. This includes the not found 3518 * indication; e.g. not found for esz=3 is -8. 3519 */ 3520 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) 3521 { 3522 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 3523 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3524 3525 return last_active_element(vg, words, esz); 3526 } 3527 3528 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) 3529 { 3530 intptr_t opr_sz = simd_oprsz(desc) / 8; 3531 int esz = simd_data(desc); 3532 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; 3533 intptr_t i, first_i, last_i; 3534 ARMVectorReg tmp; 3535 3536 first_i = last_i = 0; 3537 first_g = last_g = 0; 3538 3539 /* Find the extent of the active elements within VG. 
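 * The scan below runs from the highest predicate word down: the first
 * non-zero word provides LAST_G/LAST_I, while FIRST_G/FIRST_I keep
 * being overwritten so that on exit they describe the lowest non-zero
 * word.  For example (illustrative values only), with a 32-byte vector
 * at esz=0 and only predicate bits 3 and 9 set (pg == 0x208), we get
 * first_i == 3, last_i == 9 and len == 7: bytes 3..9 of Zn are moved
 * to the bottom of Zd and the remaining 25 bytes are taken from Zm.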
*/ 3540 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { 3541 pg = *(uint64_t *)(vg + i) & mask; 3542 if (pg) { 3543 if (last_g == 0) { 3544 last_g = pg; 3545 last_i = i; 3546 } 3547 first_g = pg; 3548 first_i = i; 3549 } 3550 } 3551 3552 len = 0; 3553 if (first_g != 0) { 3554 first_i = first_i * 8 + ctz64(first_g); 3555 last_i = last_i * 8 + 63 - clz64(last_g); 3556 len = last_i - first_i + (1 << esz); 3557 if (vd == vm) { 3558 vm = memcpy(&tmp, vm, opr_sz * 8); 3559 } 3560 swap_memmove(vd, vn + first_i, len); 3561 } 3562 swap_memmove(vd + len, vm, opr_sz * 8 - len); 3563 } 3564 3565 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, 3566 void *vg, uint32_t desc) 3567 { 3568 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3569 uint64_t *d = vd, *n = vn, *m = vm; 3570 uint8_t *pg = vg; 3571 3572 for (i = 0; i < opr_sz; i += 1) { 3573 uint64_t nn = n[i], mm = m[i]; 3574 uint64_t pp = expand_pred_b(pg[H1(i)]); 3575 d[i] = (nn & pp) | (mm & ~pp); 3576 } 3577 } 3578 3579 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, 3580 void *vg, uint32_t desc) 3581 { 3582 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3583 uint64_t *d = vd, *n = vn, *m = vm; 3584 uint8_t *pg = vg; 3585 3586 for (i = 0; i < opr_sz; i += 1) { 3587 uint64_t nn = n[i], mm = m[i]; 3588 uint64_t pp = expand_pred_h(pg[H1(i)]); 3589 d[i] = (nn & pp) | (mm & ~pp); 3590 } 3591 } 3592 3593 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, 3594 void *vg, uint32_t desc) 3595 { 3596 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3597 uint64_t *d = vd, *n = vn, *m = vm; 3598 uint8_t *pg = vg; 3599 3600 for (i = 0; i < opr_sz; i += 1) { 3601 uint64_t nn = n[i], mm = m[i]; 3602 uint64_t pp = expand_pred_s(pg[H1(i)]); 3603 d[i] = (nn & pp) | (mm & ~pp); 3604 } 3605 } 3606 3607 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, 3608 void *vg, uint32_t desc) 3609 { 3610 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3611 uint64_t *d = vd, *n = vn, *m = vm; 3612 uint8_t *pg = vg; 3613 3614 for (i = 0; i < opr_sz; i += 1) { 3615 uint64_t nn = n[i], mm = m[i]; 3616 d[i] = (pg[H1(i)] & 1 ? nn : mm); 3617 } 3618 } 3619 3620 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, 3621 void *vg, uint32_t desc) 3622 { 3623 intptr_t i, opr_sz = simd_oprsz(desc) / 16; 3624 Int128 *d = vd, *n = vn, *m = vm; 3625 uint16_t *pg = vg; 3626 3627 for (i = 0; i < opr_sz; i += 1) { 3628 d[i] = (pg[H2(i)] & 1 ? n : m)[i]; 3629 } 3630 } 3631 3632 /* Two operand comparison controlled by a predicate. 3633 * ??? It is very tempting to want to be able to expand this inline 3634 * with x86 instructions, e.g. 3635 * 3636 * vcmpeqw zm, zn, %ymm0 3637 * vpmovmskb %ymm0, %eax 3638 * and $0x5555, %eax 3639 * and pg, %eax 3640 * 3641 * or even aarch64, e.g. 3642 * 3643 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 3644 * cmeq v0.8h, zn, zm 3645 * and v0.8h, v0.8h, mask 3646 * addv h0, v0.8h 3647 * and v0.8b, pg 3648 * 3649 * However, coming up with an abstraction that allows vector inputs and 3650 * a scalar output, and also handles the byte-ordering of sub-uint64_t 3651 * scalar outputs, is tricky. 
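 * So the expansion below stays scalar: it walks the vector backward 64
 * predicate bits at a time, accumulating one result bit per element in
 * OUT.  Each step shifts OUT left by sizeof(TYPE), so for 16-bit
 * elements, for instance, the results land on bits 0, 2, 4, ... 62 of
 * the chunk, which is exactly what the 0x5555... MASK of the H
 * expansion selects.  The per-chunk result is then ANDed with the
 * governing predicate, stored to Pd, and folded into the NZCV flags
 * via iter_predtest_bwd().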
3652 */ 3653 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ 3654 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3655 { \ 3656 intptr_t opr_sz = simd_oprsz(desc); \ 3657 uint32_t flags = PREDTEST_INIT; \ 3658 intptr_t i = opr_sz; \ 3659 do { \ 3660 uint64_t out = 0, pg; \ 3661 do { \ 3662 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3663 TYPE nn = *(TYPE *)(vn + H(i)); \ 3664 TYPE mm = *(TYPE *)(vm + H(i)); \ 3665 out |= nn OP mm; \ 3666 } while (i & 63); \ 3667 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3668 out &= pg; \ 3669 *(uint64_t *)(vd + (i >> 3)) = out; \ 3670 flags = iter_predtest_bwd(out, pg, flags); \ 3671 } while (i > 0); \ 3672 return flags; \ 3673 } 3674 3675 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ 3676 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3677 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ 3678 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3679 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ 3680 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3681 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ 3682 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3683 3684 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) 3685 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) 3686 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) 3687 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) 3688 3689 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) 3690 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) 3691 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) 3692 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) 3693 3694 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) 3695 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) 3696 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) 3697 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) 3698 3699 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) 3700 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) 3701 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) 3702 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) 3703 3704 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) 3705 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) 3706 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) 3707 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) 3708 3709 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) 3710 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) 3711 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) 3712 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) 3713 3714 #undef DO_CMP_PPZZ_B 3715 #undef DO_CMP_PPZZ_H 3716 #undef DO_CMP_PPZZ_S 3717 #undef DO_CMP_PPZZ_D 3718 #undef DO_CMP_PPZZ 3719 3720 /* Similar, but the second source is "wide". 
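 * Here "wide" means Zm is treated as a vector of 64-bit elements: each
 * TYPE-sized element of Zn is compared against the single 64-bit
 * element of Zm that covers its 64-bit segment.  In outline
 * (illustrative sketch only):
 *
 *     for each 64-bit segment s:
 *         mm = Zm[s];
 *         for each TYPE element e of Zn within segment s:
 *             Pd[e] = Zn[e] OP mm;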
*/ 3721 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ 3722 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3723 { \ 3724 intptr_t opr_sz = simd_oprsz(desc); \ 3725 uint32_t flags = PREDTEST_INIT; \ 3726 intptr_t i = opr_sz; \ 3727 do { \ 3728 uint64_t out = 0, pg; \ 3729 do { \ 3730 TYPEW mm = *(TYPEW *)(vm + i - 8); \ 3731 do { \ 3732 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3733 TYPE nn = *(TYPE *)(vn + H(i)); \ 3734 out |= nn OP mm; \ 3735 } while (i & 7); \ 3736 } while (i & 63); \ 3737 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3738 out &= pg; \ 3739 *(uint64_t *)(vd + (i >> 3)) = out; \ 3740 flags = iter_predtest_bwd(out, pg, flags); \ 3741 } while (i > 0); \ 3742 return flags; \ 3743 } 3744 3745 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ 3746 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) 3747 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ 3748 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) 3749 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ 3750 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) 3751 3752 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) 3753 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) 3754 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) 3755 3756 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) 3757 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) 3758 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) 3759 3760 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) 3761 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) 3762 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) 3763 3764 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) 3765 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) 3766 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) 3767 3768 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) 3769 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) 3770 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) 3771 3772 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) 3773 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) 3774 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) 3775 3776 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) 3777 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) 3778 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) 3779 3780 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) 3781 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) 3782 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) 3783 3784 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) 3785 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) 3786 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) 3787 3788 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) 3789 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) 3790 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) 3791 3792 #undef DO_CMP_PPZW_B 3793 #undef DO_CMP_PPZW_H 3794 #undef DO_CMP_PPZW_S 3795 #undef DO_CMP_PPZW 3796 3797 /* Similar, but the second source is immediate. 
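 * The immediate is carried in the simd_data() field of DESC and is
 * converted to TYPE once, before the loop, so in effect each active
 * element computes Pd[e] = Zn[e] OP (TYPE)simd_data(desc).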
*/ 3798 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ 3799 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 3800 { \ 3801 intptr_t opr_sz = simd_oprsz(desc); \ 3802 uint32_t flags = PREDTEST_INIT; \ 3803 TYPE mm = simd_data(desc); \ 3804 intptr_t i = opr_sz; \ 3805 do { \ 3806 uint64_t out = 0, pg; \ 3807 do { \ 3808 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3809 TYPE nn = *(TYPE *)(vn + H(i)); \ 3810 out |= nn OP mm; \ 3811 } while (i & 63); \ 3812 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3813 out &= pg; \ 3814 *(uint64_t *)(vd + (i >> 3)) = out; \ 3815 flags = iter_predtest_bwd(out, pg, flags); \ 3816 } while (i > 0); \ 3817 return flags; \ 3818 } 3819 3820 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \ 3821 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3822 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \ 3823 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3824 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \ 3825 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3826 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \ 3827 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3828 3829 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) 3830 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) 3831 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) 3832 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) 3833 3834 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) 3835 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) 3836 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) 3837 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) 3838 3839 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) 3840 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) 3841 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) 3842 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) 3843 3844 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) 3845 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) 3846 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) 3847 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) 3848 3849 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) 3850 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) 3851 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) 3852 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) 3853 3854 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) 3855 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) 3856 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) 3857 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) 3858 3859 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) 3860 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) 3861 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) 3862 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) 3863 3864 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) 3865 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) 3866 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) 3867 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) 3868 3869 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) 3870 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) 3871 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) 3872 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) 3873 3874 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) 3875 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) 3876 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) 3877 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) 3878 3879 #undef DO_CMP_PPZI_B 3880 #undef DO_CMP_PPZI_H 3881 #undef DO_CMP_PPZI_S 3882 #undef DO_CMP_PPZI_D 3883 #undef DO_CMP_PPZI 3884 3885 /* Similar to the ARM LastActive pseudocode function. 
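 * That is, return the value of the highest-numbered element of D that
 * is governed by a true bit of G.  The scan below starts from the top
 * predicate word; in the first non-zero word, pow2floor(pg) isolates
 * the most significant guard bit.  For example (illustrative values
 * only), pg == 0x14 gives pow2floor(pg) == 0x10, and the result is
 * simply (d & 0x10) != 0.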
*/ 3886 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) 3887 { 3888 intptr_t i; 3889 3890 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { 3891 uint64_t pg = *(uint64_t *)(vg + i); 3892 if (pg) { 3893 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; 3894 } 3895 } 3896 return 0; 3897 } 3898 3899 /* Compute a mask into RETB that is true for all G, up to and including 3900 * (if after) or excluding (if !after) the first G & N. 3901 * Return true if BRK found. 3902 */ 3903 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, 3904 bool brk, bool after) 3905 { 3906 uint64_t b; 3907 3908 if (brk) { 3909 b = 0; 3910 } else if ((g & n) == 0) { 3911 /* For all G, no N are set; break not found. */ 3912 b = g; 3913 } else { 3914 /* Break somewhere in N. Locate it. */ 3915 b = g & n; /* guard true, pred true */ 3916 b = b & -b; /* first such */ 3917 if (after) { 3918 b = b | (b - 1); /* break after same */ 3919 } else { 3920 b = b - 1; /* break before same */ 3921 } 3922 brk = true; 3923 } 3924 3925 *retb = b; 3926 return brk; 3927 } 3928 3929 /* Compute a zeroing BRK. */ 3930 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, 3931 intptr_t oprsz, bool after) 3932 { 3933 bool brk = false; 3934 intptr_t i; 3935 3936 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3937 uint64_t this_b, this_g = g[i]; 3938 3939 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3940 d[i] = this_b & this_g; 3941 } 3942 } 3943 3944 /* Likewise, but also compute flags. */ 3945 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, 3946 intptr_t oprsz, bool after) 3947 { 3948 uint32_t flags = PREDTEST_INIT; 3949 bool brk = false; 3950 intptr_t i; 3951 3952 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3953 uint64_t this_b, this_d, this_g = g[i]; 3954 3955 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3956 d[i] = this_d = this_b & this_g; 3957 flags = iter_predtest_fwd(this_d, this_g, flags); 3958 } 3959 return flags; 3960 } 3961 3962 /* Compute a merging BRK. */ 3963 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, 3964 intptr_t oprsz, bool after) 3965 { 3966 bool brk = false; 3967 intptr_t i; 3968 3969 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3970 uint64_t this_b, this_g = g[i]; 3971 3972 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3973 d[i] = (this_b & this_g) | (d[i] & ~this_g); 3974 } 3975 } 3976 3977 /* Likewise, but also compute flags. */ 3978 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, 3979 intptr_t oprsz, bool after) 3980 { 3981 uint32_t flags = PREDTEST_INIT; 3982 bool brk = false; 3983 intptr_t i; 3984 3985 for (i = 0; i < oprsz / 8; ++i) { 3986 uint64_t this_b, this_d = d[i], this_g = g[i]; 3987 3988 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3989 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); 3990 flags = iter_predtest_fwd(this_d, this_g, flags); 3991 } 3992 return flags; 3993 } 3994 3995 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) 3996 { 3997 /* It is quicker to zero the whole predicate than loop on OPRSZ. 3998 * The compiler should turn this into 4 64-bit integer stores. 
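 * (sizeof(ARMPredicateReg) is fixed at 256 bits, i.e. 32 bytes,
 * independent of OPRSZ, hence the four stores.)  Returning
 * PREDTEST_INIT gives the expected PTEST result for an all-false
 * predicate: N==0, Z==1, C==1.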
3999 */ 4000 memset(d, 0, sizeof(ARMPredicateReg)); 4001 return PREDTEST_INIT; 4002 } 4003 4004 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, 4005 uint32_t pred_desc) 4006 { 4007 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4008 if (last_active_pred(vn, vg, oprsz)) { 4009 compute_brk_z(vd, vm, vg, oprsz, true); 4010 } else { 4011 do_zero(vd, oprsz); 4012 } 4013 } 4014 4015 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, 4016 uint32_t pred_desc) 4017 { 4018 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4019 if (last_active_pred(vn, vg, oprsz)) { 4020 return compute_brks_z(vd, vm, vg, oprsz, true); 4021 } else { 4022 return do_zero(vd, oprsz); 4023 } 4024 } 4025 4026 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, 4027 uint32_t pred_desc) 4028 { 4029 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4030 if (last_active_pred(vn, vg, oprsz)) { 4031 compute_brk_z(vd, vm, vg, oprsz, false); 4032 } else { 4033 do_zero(vd, oprsz); 4034 } 4035 } 4036 4037 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, 4038 uint32_t pred_desc) 4039 { 4040 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4041 if (last_active_pred(vn, vg, oprsz)) { 4042 return compute_brks_z(vd, vm, vg, oprsz, false); 4043 } else { 4044 return do_zero(vd, oprsz); 4045 } 4046 } 4047 4048 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4049 { 4050 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4051 compute_brk_z(vd, vn, vg, oprsz, true); 4052 } 4053 4054 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4055 { 4056 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4057 return compute_brks_z(vd, vn, vg, oprsz, true); 4058 } 4059 4060 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4061 { 4062 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4063 compute_brk_z(vd, vn, vg, oprsz, false); 4064 } 4065 4066 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4067 { 4068 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4069 return compute_brks_z(vd, vn, vg, oprsz, false); 4070 } 4071 4072 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4073 { 4074 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4075 compute_brk_m(vd, vn, vg, oprsz, true); 4076 } 4077 4078 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4079 { 4080 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4081 return compute_brks_m(vd, vn, vg, oprsz, true); 4082 } 4083 4084 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4085 { 4086 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4087 compute_brk_m(vd, vn, vg, oprsz, false); 4088 } 4089 4090 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4091 { 4092 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4093 return compute_brks_m(vd, vn, vg, oprsz, false); 4094 } 4095 4096 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4097 { 4098 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4099 if (!last_active_pred(vn, vg, oprsz)) { 4100 do_zero(vd, oprsz); 4101 } 4102 } 4103 4104 /* As if PredTest(Ones(PL), D, esz). 
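 * That is, compute the NZCV flags for D as if every element of the
 * given size were governed by a true predicate bit.  In outline
 * (illustrative sketch only):
 *
 *     flags = PREDTEST_INIT;
 *     for each 64-bit word w of D within OPRSZ:
 *         flags = iter_predtest_fwd(w, esz_mask, flags);
 *
 * with the final, partial word masked down to OPRSZ bytes.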
*/ 4105 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, 4106 uint64_t esz_mask) 4107 { 4108 uint32_t flags = PREDTEST_INIT; 4109 intptr_t i; 4110 4111 for (i = 0; i < oprsz / 8; i++) { 4112 flags = iter_predtest_fwd(d->p[i], esz_mask, flags); 4113 } 4114 if (oprsz & 7) { 4115 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); 4116 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); 4117 } 4118 return flags; 4119 } 4120 4121 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4122 { 4123 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4124 if (last_active_pred(vn, vg, oprsz)) { 4125 return predtest_ones(vd, oprsz, -1); 4126 } else { 4127 return do_zero(vd, oprsz); 4128 } 4129 } 4130 4131 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) 4132 { 4133 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 4134 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4135 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; 4136 intptr_t i; 4137 4138 for (i = 0; i < words; ++i) { 4139 uint64_t t = n[i] & g[i] & mask; 4140 sum += ctpop64(t); 4141 } 4142 return sum; 4143 } 4144 4145 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) 4146 { 4147 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4148 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4149 uint64_t esz_mask = pred_esz_masks[esz]; 4150 ARMPredicateReg *d = vd; 4151 uint32_t flags; 4152 intptr_t i; 4153 4154 /* Begin with a zero predicate register. */ 4155 flags = do_zero(d, oprsz); 4156 if (count == 0) { 4157 return flags; 4158 } 4159 4160 /* Set all of the requested bits. */ 4161 for (i = 0; i < count / 64; ++i) { 4162 d->p[i] = esz_mask; 4163 } 4164 if (count & 63) { 4165 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; 4166 } 4167 4168 return predtest_ones(d, oprsz, esz_mask); 4169 } 4170 4171 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) 4172 { 4173 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4174 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4175 uint64_t esz_mask = pred_esz_masks[esz]; 4176 ARMPredicateReg *d = vd; 4177 intptr_t i, invcount, oprbits; 4178 uint64_t bits; 4179 4180 if (count == 0) { 4181 return do_zero(d, oprsz); 4182 } 4183 4184 oprbits = oprsz * 8; 4185 tcg_debug_assert(count <= oprbits); 4186 4187 bits = esz_mask; 4188 if (oprbits & 63) { 4189 bits &= MAKE_64BIT_MASK(0, oprbits & 63); 4190 } 4191 4192 invcount = oprbits - count; 4193 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { 4194 d->p[i] = bits; 4195 bits = esz_mask; 4196 } 4197 4198 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); 4199 4200 while (--i >= 0) { 4201 d->p[i] = 0; 4202 } 4203 4204 return predtest_ones(d, oprsz, esz_mask); 4205 } 4206 4207 /* Recursive reduction on a function; 4208 * C.f. the ARM ARM function ReducePredicated. 4209 * 4210 * While it would be possible to write this without the DATA temporary, 4211 * it is much simpler to process the predicate register this way. 4212 * The recursion is bounded to depth 7 (128 fp16 elements), so there's 4213 * little to gain with a more complex non-recursive form. 
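 * For example (illustrative only), with eight elements the recursion
 * evaluates
 *
 *     FUNC(FUNC(FUNC(d0,d1), FUNC(d2,d3)),
 *          FUNC(FUNC(d4,d5), FUNC(d6,d7)))
 *
 * which is why inactive and trailing lanes must first be filled with
 * IDENT, the identity value for FUNC; that is what the DATA temporary
 * is for.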
4214 */ 4215 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ 4216 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ 4217 { \ 4218 if (n == 1) { \ 4219 return *data; \ 4220 } else { \ 4221 uintptr_t half = n / 2; \ 4222 TYPE lo = NAME##_reduce(data, status, half); \ 4223 TYPE hi = NAME##_reduce(data + half, status, half); \ 4224 return FUNC(lo, hi, status); \ 4225 } \ 4226 } \ 4227 uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \ 4228 { \ 4229 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ 4230 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ 4231 for (i = 0; i < oprsz; ) { \ 4232 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 4233 do { \ 4234 TYPE nn = *(TYPE *)(vn + H(i)); \ 4235 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \ 4236 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 4237 } while (i & 15); \ 4238 } \ 4239 for (; i < maxsz; i += sizeof(TYPE)) { \ 4240 *(TYPE *)((void *)data + i) = IDENT; \ 4241 } \ 4242 return NAME##_reduce(data, s, maxsz / sizeof(TYPE)); \ 4243 } 4244 4245 DO_REDUCE(sve_faddv_h, float16, H1_2, float16_add, float16_zero) 4246 DO_REDUCE(sve_faddv_s, float32, H1_4, float32_add, float32_zero) 4247 DO_REDUCE(sve_faddv_d, float64, H1_8, float64_add, float64_zero) 4248 4249 /* Identity is floatN_default_nan, without the function call. */ 4250 DO_REDUCE(sve_fminnmv_h, float16, H1_2, float16_minnum, 0x7E00) 4251 DO_REDUCE(sve_fminnmv_s, float32, H1_4, float32_minnum, 0x7FC00000) 4252 DO_REDUCE(sve_fminnmv_d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL) 4253 4254 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, float16_maxnum, 0x7E00) 4255 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, float32_maxnum, 0x7FC00000) 4256 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL) 4257 4258 DO_REDUCE(sve_fminv_h, float16, H1_2, float16_min, float16_infinity) 4259 DO_REDUCE(sve_fminv_s, float32, H1_4, float32_min, float32_infinity) 4260 DO_REDUCE(sve_fminv_d, float64, H1_8, float64_min, float64_infinity) 4261 4262 DO_REDUCE(sve_fmaxv_h, float16, H1_2, float16_max, float16_chs(float16_infinity)) 4263 DO_REDUCE(sve_fmaxv_s, float32, H1_4, float32_max, float32_chs(float32_infinity)) 4264 DO_REDUCE(sve_fmaxv_d, float64, H1_8, float64_max, float64_chs(float64_infinity)) 4265 4266 DO_REDUCE(sve_ah_fminv_h, float16, H1_2, helper_vfp_ah_minh, float16_infinity) 4267 DO_REDUCE(sve_ah_fminv_s, float32, H1_4, helper_vfp_ah_mins, float32_infinity) 4268 DO_REDUCE(sve_ah_fminv_d, float64, H1_8, helper_vfp_ah_mind, float64_infinity) 4269 4270 DO_REDUCE(sve_ah_fmaxv_h, float16, H1_2, helper_vfp_ah_maxh, 4271 float16_chs(float16_infinity)) 4272 DO_REDUCE(sve_ah_fmaxv_s, float32, H1_4, helper_vfp_ah_maxs, 4273 float32_chs(float32_infinity)) 4274 DO_REDUCE(sve_ah_fmaxv_d, float64, H1_8, helper_vfp_ah_maxd, 4275 float64_chs(float64_infinity)) 4276 4277 #undef DO_REDUCE 4278 4279 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, 4280 float_status *status, uint32_t desc) 4281 { 4282 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4283 float16 result = nn; 4284 4285 do { 4286 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4287 do { 4288 if (pg & 1) { 4289 float16 mm = *(float16 *)(vm + H1_2(i)); 4290 result = float16_add(result, mm, status); 4291 } 4292 i += sizeof(float16), pg >>= sizeof(float16); 4293 } while (i & 15); 4294 } while (i < opr_sz); 4295 4296 return result; 4297 } 4298 4299 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, 4300 float_status *status, uint32_t desc) 4301 { 4302 
intptr_t i = 0, opr_sz = simd_oprsz(desc); 4303 float32 result = nn; 4304 4305 do { 4306 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4307 do { 4308 if (pg & 1) { 4309 float32 mm = *(float32 *)(vm + H1_2(i)); 4310 result = float32_add(result, mm, status); 4311 } 4312 i += sizeof(float32), pg >>= sizeof(float32); 4313 } while (i & 15); 4314 } while (i < opr_sz); 4315 4316 return result; 4317 } 4318 4319 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, 4320 float_status *status, uint32_t desc) 4321 { 4322 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; 4323 uint64_t *m = vm; 4324 uint8_t *pg = vg; 4325 4326 for (i = 0; i < opr_sz; i++) { 4327 if (pg[H1(i)] & 1) { 4328 nn = float64_add(nn, m[i], status); 4329 } 4330 } 4331 4332 return nn; 4333 } 4334 4335 /* Fully general three-operand expander, controlled by a predicate, 4336 * With the extra float_status parameter. 4337 */ 4338 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ 4339 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4340 float_status *status, uint32_t desc) \ 4341 { \ 4342 intptr_t i = simd_oprsz(desc); \ 4343 uint64_t *g = vg; \ 4344 do { \ 4345 uint64_t pg = g[(i - 1) >> 6]; \ 4346 do { \ 4347 i -= sizeof(TYPE); \ 4348 if (likely((pg >> (i & 63)) & 1)) { \ 4349 TYPE nn = *(TYPE *)(vn + H(i)); \ 4350 TYPE mm = *(TYPE *)(vm + H(i)); \ 4351 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4352 } \ 4353 } while (i & 63); \ 4354 } while (i != 0); \ 4355 } 4356 4357 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) 4358 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) 4359 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) 4360 4361 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) 4362 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) 4363 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) 4364 4365 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) 4366 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) 4367 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) 4368 4369 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) 4370 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) 4371 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) 4372 4373 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) 4374 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) 4375 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) 4376 4377 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) 4378 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) 4379 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) 4380 4381 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh) 4382 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins) 4383 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind) 4384 4385 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh) 4386 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs) 4387 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd) 4388 4389 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) 4390 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) 4391 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) 4392 4393 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) 4394 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) 4395 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) 4396 4397 static inline float16 abd_h(float16 a, float16 b, float_status *s) 4398 { 4399 return float16_abs(float16_sub(a, b, s)); 4400 } 4401 4402 static inline float32 abd_s(float32 a, float32 b, float_status *s) 4403 { 4404 return 
float32_abs(float32_sub(a, b, s)); 4405 } 4406 4407 static inline float64 abd_d(float64 a, float64 b, float_status *s) 4408 { 4409 return float64_abs(float64_sub(a, b, s)); 4410 } 4411 4412 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */ 4413 static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat) 4414 { 4415 float16 r = float16_sub(op1, op2, stat); 4416 return float16_is_any_nan(r) ? r : float16_abs(r); 4417 } 4418 4419 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat) 4420 { 4421 float32 r = float32_sub(op1, op2, stat); 4422 return float32_is_any_nan(r) ? r : float32_abs(r); 4423 } 4424 4425 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat) 4426 { 4427 float64 r = float64_sub(op1, op2, stat); 4428 return float64_is_any_nan(r) ? r : float64_abs(r); 4429 } 4430 4431 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) 4432 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) 4433 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) 4434 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h) 4435 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s) 4436 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d) 4437 4438 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) 4439 { 4440 int b_int = MIN(MAX(b, INT_MIN), INT_MAX); 4441 return float64_scalbn(a, b_int, s); 4442 } 4443 4444 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) 4445 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) 4446 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) 4447 4448 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) 4449 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) 4450 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) 4451 4452 #undef DO_ZPZZ_FP 4453 4454 /* Three-operand expander, with one scalar operand, controlled by 4455 * a predicate, with the extra float_status parameter. 
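 * The scalar arrives as a uint64_t and is converted to TYPE once,
 * before the loop, so every active element sees the same value: for
 * each active element, in effect Zd[e] = OP(Zn[e], (TYPE)scalar,
 * status).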
4456 */ 4457 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \ 4458 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ 4459 float_status *status, uint32_t desc) \ 4460 { \ 4461 intptr_t i = simd_oprsz(desc); \ 4462 uint64_t *g = vg; \ 4463 TYPE mm = scalar; \ 4464 do { \ 4465 uint64_t pg = g[(i - 1) >> 6]; \ 4466 do { \ 4467 i -= sizeof(TYPE); \ 4468 if (likely((pg >> (i & 63)) & 1)) { \ 4469 TYPE nn = *(TYPE *)(vn + H(i)); \ 4470 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4471 } \ 4472 } while (i & 63); \ 4473 } while (i != 0); \ 4474 } 4475 4476 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) 4477 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) 4478 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) 4479 4480 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) 4481 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) 4482 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) 4483 4484 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) 4485 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) 4486 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) 4487 4488 static inline float16 subr_h(float16 a, float16 b, float_status *s) 4489 { 4490 return float16_sub(b, a, s); 4491 } 4492 4493 static inline float32 subr_s(float32 a, float32 b, float_status *s) 4494 { 4495 return float32_sub(b, a, s); 4496 } 4497 4498 static inline float64 subr_d(float64 a, float64 b, float_status *s) 4499 { 4500 return float64_sub(b, a, s); 4501 } 4502 4503 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) 4504 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) 4505 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) 4506 4507 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) 4508 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) 4509 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) 4510 4511 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) 4512 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) 4513 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) 4514 4515 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) 4516 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) 4517 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) 4518 4519 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) 4520 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) 4521 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) 4522 4523 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh) 4524 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs) 4525 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd) 4526 4527 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh) 4528 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins) 4529 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind) 4530 4531 /* Fully general two-operand expander, controlled by a predicate, 4532 * With the extra float_status parameter. 4533 */ 4534 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \ 4535 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 4536 float_status *status, uint32_t desc) \ 4537 { \ 4538 intptr_t i = simd_oprsz(desc); \ 4539 uint64_t *g = vg; \ 4540 do { \ 4541 uint64_t pg = g[(i - 1) >> 6]; \ 4542 do { \ 4543 i -= sizeof(TYPE); \ 4544 if (likely((pg >> (i & 63)) & 1)) { \ 4545 TYPE nn = *(TYPE *)(vn + H(i)); \ 4546 *(TYPE *)(vd + H(i)) = OP(nn, status); \ 4547 } \ 4548 } while (i & 63); \ 4549 } while (i != 0); \ 4550 } 4551 4552 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore 4553 * FZ16. 
When converting from fp16, this affects flushing input denormals; 4554 * when converting to fp16, this affects flushing output denormals. 4555 */ 4556 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) 4557 { 4558 bool save = get_flush_inputs_to_zero(fpst); 4559 float32 ret; 4560 4561 set_flush_inputs_to_zero(false, fpst); 4562 ret = float16_to_float32(f, true, fpst); 4563 set_flush_inputs_to_zero(save, fpst); 4564 return ret; 4565 } 4566 4567 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) 4568 { 4569 bool save = get_flush_inputs_to_zero(fpst); 4570 float64 ret; 4571 4572 set_flush_inputs_to_zero(false, fpst); 4573 ret = float16_to_float64(f, true, fpst); 4574 set_flush_inputs_to_zero(save, fpst); 4575 return ret; 4576 } 4577 4578 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) 4579 { 4580 bool save = get_flush_to_zero(fpst); 4581 float16 ret; 4582 4583 set_flush_to_zero(false, fpst); 4584 ret = float32_to_float16(f, true, fpst); 4585 set_flush_to_zero(save, fpst); 4586 return ret; 4587 } 4588 4589 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) 4590 { 4591 bool save = get_flush_to_zero(fpst); 4592 float16 ret; 4593 4594 set_flush_to_zero(false, fpst); 4595 ret = float64_to_float16(f, true, fpst); 4596 set_flush_to_zero(save, fpst); 4597 return ret; 4598 } 4599 4600 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) 4601 { 4602 if (float16_is_any_nan(f)) { 4603 float_raise(float_flag_invalid, s); 4604 return 0; 4605 } 4606 return float16_to_int16_round_to_zero(f, s); 4607 } 4608 4609 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) 4610 { 4611 if (float16_is_any_nan(f)) { 4612 float_raise(float_flag_invalid, s); 4613 return 0; 4614 } 4615 return float16_to_int64_round_to_zero(f, s); 4616 } 4617 4618 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) 4619 { 4620 if (float32_is_any_nan(f)) { 4621 float_raise(float_flag_invalid, s); 4622 return 0; 4623 } 4624 return float32_to_int64_round_to_zero(f, s); 4625 } 4626 4627 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) 4628 { 4629 if (float64_is_any_nan(f)) { 4630 float_raise(float_flag_invalid, s); 4631 return 0; 4632 } 4633 return float64_to_int64_round_to_zero(f, s); 4634 } 4635 4636 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) 4637 { 4638 if (float16_is_any_nan(f)) { 4639 float_raise(float_flag_invalid, s); 4640 return 0; 4641 } 4642 return float16_to_uint16_round_to_zero(f, s); 4643 } 4644 4645 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) 4646 { 4647 if (float16_is_any_nan(f)) { 4648 float_raise(float_flag_invalid, s); 4649 return 0; 4650 } 4651 return float16_to_uint64_round_to_zero(f, s); 4652 } 4653 4654 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) 4655 { 4656 if (float32_is_any_nan(f)) { 4657 float_raise(float_flag_invalid, s); 4658 return 0; 4659 } 4660 return float32_to_uint64_round_to_zero(f, s); 4661 } 4662 4663 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) 4664 { 4665 if (float64_is_any_nan(f)) { 4666 float_raise(float_flag_invalid, s); 4667 return 0; 4668 } 4669 return float64_to_uint64_round_to_zero(f, s); 4670 } 4671 4672 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) 4673 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) 4674 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) 4675 DO_ZPZ_FP(sve_fcvt_dh, 
uint64_t, H1_8, sve_f64_to_f16) 4676 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) 4677 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32) 4678 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) 4679 4680 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) 4681 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) 4682 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) 4683 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) 4684 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 4685 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) 4686 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) 4687 4688 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) 4689 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) 4690 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) 4691 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) 4692 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) 4693 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) 4694 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) 4695 4696 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) 4697 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) 4698 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) 4699 4700 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) 4701 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) 4702 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) 4703 4704 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) 4705 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) 4706 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) 4707 4708 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) 4709 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) 4710 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) 4711 4712 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) 4713 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) 4714 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) 4715 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) 4716 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) 4717 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) 4718 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) 4719 4720 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) 4721 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) 4722 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) 4723 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) 4724 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) 4725 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) 4726 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) 4727 4728 static int16_t do_float16_logb_as_int(float16 a, float_status *s) 4729 { 4730 /* Extract frac to the top of the uint32_t. 
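 * float16 is 1 sign + 5 exponent + 10 fraction bits, so shifting left
 * by 16 + 6 places the 10 fraction bits at the top of the 32-bit word.
 * For a denormal input, value == 0.frac * 2**-14; if the leading
 * fraction bit is the k-th (counting from 1), clz32(frac) == k - 1 and
 * -15 - clz32(frac) below is the required exponent, -14 - k.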
*/ 4731 uint32_t frac = (uint32_t)a << (16 + 6); 4732 int16_t exp = extract32(a, 10, 5); 4733 4734 if (unlikely(exp == 0)) { 4735 if (frac != 0) { 4736 if (!get_flush_inputs_to_zero(s)) { 4737 /* denormal: bias - fractional_zeros */ 4738 return -15 - clz32(frac); 4739 } 4740 /* flush to zero */ 4741 float_raise(float_flag_input_denormal_flushed, s); 4742 } 4743 } else if (unlikely(exp == 0x1f)) { 4744 if (frac == 0) { 4745 return INT16_MAX; /* infinity */ 4746 } 4747 } else { 4748 /* normal: exp - bias */ 4749 return exp - 15; 4750 } 4751 /* nan or zero */ 4752 float_raise(float_flag_invalid, s); 4753 return INT16_MIN; 4754 } 4755 4756 static int32_t do_float32_logb_as_int(float32 a, float_status *s) 4757 { 4758 /* Extract frac to the top of the uint32_t. */ 4759 uint32_t frac = a << 9; 4760 int32_t exp = extract32(a, 23, 8); 4761 4762 if (unlikely(exp == 0)) { 4763 if (frac != 0) { 4764 if (!get_flush_inputs_to_zero(s)) { 4765 /* denormal: bias - fractional_zeros */ 4766 return -127 - clz32(frac); 4767 } 4768 /* flush to zero */ 4769 float_raise(float_flag_input_denormal_flushed, s); 4770 } 4771 } else if (unlikely(exp == 0xff)) { 4772 if (frac == 0) { 4773 return INT32_MAX; /* infinity */ 4774 } 4775 } else { 4776 /* normal: exp - bias */ 4777 return exp - 127; 4778 } 4779 /* nan or zero */ 4780 float_raise(float_flag_invalid, s); 4781 return INT32_MIN; 4782 } 4783 4784 static int64_t do_float64_logb_as_int(float64 a, float_status *s) 4785 { 4786 /* Extract frac to the top of the uint64_t. */ 4787 uint64_t frac = a << 12; 4788 int64_t exp = extract64(a, 52, 11); 4789 4790 if (unlikely(exp == 0)) { 4791 if (frac != 0) { 4792 if (!get_flush_inputs_to_zero(s)) { 4793 /* denormal: bias - fractional_zeros */ 4794 return -1023 - clz64(frac); 4795 } 4796 /* flush to zero */ 4797 float_raise(float_flag_input_denormal_flushed, s); 4798 } 4799 } else if (unlikely(exp == 0x7ff)) { 4800 if (frac == 0) { 4801 return INT64_MAX; /* infinity */ 4802 } 4803 } else { 4804 /* normal: exp - bias */ 4805 return exp - 1023; 4806 } 4807 /* nan or zero */ 4808 float_raise(float_flag_invalid, s); 4809 return INT64_MIN; 4810 } 4811 4812 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) 4813 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) 4814 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) 4815 4816 #undef DO_ZPZ_FP 4817 4818 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, 4819 float_status *status, uint32_t desc, 4820 uint16_t neg1, uint16_t neg3, int flags) 4821 { 4822 intptr_t i = simd_oprsz(desc); 4823 uint64_t *g = vg; 4824 4825 do { 4826 uint64_t pg = g[(i - 1) >> 6]; 4827 do { 4828 i -= 2; 4829 if (likely((pg >> (i & 63)) & 1)) { 4830 float16 e1, e2, e3, r; 4831 4832 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; 4833 e2 = *(uint16_t *)(vm + H1_2(i)); 4834 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; 4835 r = float16_muladd(e1, e2, e3, flags, status); 4836 *(uint16_t *)(vd + H1_2(i)) = r; 4837 } 4838 } while (i & 63); 4839 } while (i != 0); 4840 } 4841 4842 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4843 void *vg, float_status *status, uint32_t desc) 4844 { 4845 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 4846 } 4847 4848 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4849 void *vg, float_status *status, uint32_t desc) 4850 { 4851 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0); 4852 } 4853 4854 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4855 void 
*vg, float_status *status, uint32_t desc) 4856 { 4857 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0); 4858 } 4859 4860 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4861 void *vg, float_status *status, uint32_t desc) 4862 { 4863 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0); 4864 } 4865 4866 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4867 void *vg, float_status *status, uint32_t desc) 4868 { 4869 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 4870 float_muladd_negate_product); 4871 } 4872 4873 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4874 void *vg, float_status *status, uint32_t desc) 4875 { 4876 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 4877 float_muladd_negate_product | float_muladd_negate_c); 4878 } 4879 4880 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4881 void *vg, float_status *status, uint32_t desc) 4882 { 4883 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 4884 float_muladd_negate_c); 4885 } 4886 4887 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, 4888 float_status *status, uint32_t desc, 4889 uint32_t neg1, uint32_t neg3, int flags) 4890 { 4891 intptr_t i = simd_oprsz(desc); 4892 uint64_t *g = vg; 4893 4894 do { 4895 uint64_t pg = g[(i - 1) >> 6]; 4896 do { 4897 i -= 4; 4898 if (likely((pg >> (i & 63)) & 1)) { 4899 float32 e1, e2, e3, r; 4900 4901 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; 4902 e2 = *(uint32_t *)(vm + H1_4(i)); 4903 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; 4904 r = float32_muladd(e1, e2, e3, flags, status); 4905 *(uint32_t *)(vd + H1_4(i)) = r; 4906 } 4907 } while (i & 63); 4908 } while (i != 0); 4909 } 4910 4911 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4912 void *vg, float_status *status, uint32_t desc) 4913 { 4914 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 4915 } 4916 4917 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4918 void *vg, float_status *status, uint32_t desc) 4919 { 4920 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0); 4921 } 4922 4923 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4924 void *vg, float_status *status, uint32_t desc) 4925 { 4926 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0); 4927 } 4928 4929 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4930 void *vg, float_status *status, uint32_t desc) 4931 { 4932 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0); 4933 } 4934 4935 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4936 void *vg, float_status *status, uint32_t desc) 4937 { 4938 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 4939 float_muladd_negate_product); 4940 } 4941 4942 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4943 void *vg, float_status *status, uint32_t desc) 4944 { 4945 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 4946 float_muladd_negate_product | float_muladd_negate_c); 4947 } 4948 4949 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4950 void *vg, float_status *status, uint32_t desc) 4951 { 4952 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 4953 float_muladd_negate_c); 4954 } 4955 4956 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, 4957 float_status *status, uint32_t desc, 4958 uint64_t 
neg1, uint64_t neg3, int flags) 4959 { 4960 intptr_t i = simd_oprsz(desc); 4961 uint64_t *g = vg; 4962 4963 do { 4964 uint64_t pg = g[(i - 1) >> 6]; 4965 do { 4966 i -= 8; 4967 if (likely((pg >> (i & 63)) & 1)) { 4968 float64 e1, e2, e3, r; 4969 4970 e1 = *(uint64_t *)(vn + i) ^ neg1; 4971 e2 = *(uint64_t *)(vm + i); 4972 e3 = *(uint64_t *)(va + i) ^ neg3; 4973 r = float64_muladd(e1, e2, e3, flags, status); 4974 *(uint64_t *)(vd + i) = r; 4975 } 4976 } while (i & 63); 4977 } while (i != 0); 4978 } 4979 4980 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4981 void *vg, float_status *status, uint32_t desc) 4982 { 4983 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 4984 } 4985 4986 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4987 void *vg, float_status *status, uint32_t desc) 4988 { 4989 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0); 4990 } 4991 4992 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4993 void *vg, float_status *status, uint32_t desc) 4994 { 4995 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0); 4996 } 4997 4998 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4999 void *vg, float_status *status, uint32_t desc) 5000 { 5001 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0); 5002 } 5003 5004 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5005 void *vg, float_status *status, uint32_t desc) 5006 { 5007 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5008 float_muladd_negate_product); 5009 } 5010 5011 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5012 void *vg, float_status *status, uint32_t desc) 5013 { 5014 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5015 float_muladd_negate_product | float_muladd_negate_c); 5016 } 5017 5018 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5019 void *vg, float_status *status, uint32_t desc) 5020 { 5021 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5022 float_muladd_negate_c); 5023 } 5024 5025 /* Two operand floating-point comparison controlled by a predicate. 5026 * Unlike the integer version, we are not allowed to optimistically 5027 * compare operands, since the comparison may have side effects wrt 5028 * the FPSR. 
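 * Note also which softfloat primitive each operation uses below:
 * DO_FCMGE/GT/LE/LT and DO_FACGE/GT use TYPE##_compare, which raises
 * Invalid for any NaN operand, while DO_FCMEQ/NE/UO use
 * TYPE##_compare_quiet, which raises Invalid only for signalling NaNs.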
5029 */ 5030 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ 5031 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 5032 float_status *status, uint32_t desc) \ 5033 { \ 5034 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 5035 uint64_t *d = vd, *g = vg; \ 5036 do { \ 5037 uint64_t out = 0, pg = g[j]; \ 5038 do { \ 5039 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 5040 if (likely((pg >> (i & 63)) & 1)) { \ 5041 TYPE nn = *(TYPE *)(vn + H(i)); \ 5042 TYPE mm = *(TYPE *)(vm + H(i)); \ 5043 out |= OP(TYPE, nn, mm, status); \ 5044 } \ 5045 } while (i & 63); \ 5046 d[j--] = out; \ 5047 } while (i > 0); \ 5048 } 5049 5050 #define DO_FPCMP_PPZZ_H(NAME, OP) \ 5051 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) 5052 #define DO_FPCMP_PPZZ_S(NAME, OP) \ 5053 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) 5054 #define DO_FPCMP_PPZZ_D(NAME, OP) \ 5055 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) 5056 5057 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \ 5058 DO_FPCMP_PPZZ_H(NAME, OP) \ 5059 DO_FPCMP_PPZZ_S(NAME, OP) \ 5060 DO_FPCMP_PPZZ_D(NAME, OP) 5061 5062 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 5063 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 5064 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 5065 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 5066 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 5067 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 5068 #define DO_FCMUO(TYPE, X, Y, ST) \ 5069 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered 5070 #define DO_FACGE(TYPE, X, Y, ST) \ 5071 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 5072 #define DO_FACGT(TYPE, X, Y, ST) \ 5073 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 5074 5075 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) 5076 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) 5077 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) 5078 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) 5079 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) 5080 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) 5081 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) 5082 5083 #undef DO_FPCMP_PPZZ_ALL 5084 #undef DO_FPCMP_PPZZ_D 5085 #undef DO_FPCMP_PPZZ_S 5086 #undef DO_FPCMP_PPZZ_H 5087 #undef DO_FPCMP_PPZZ 5088 5089 /* One operand floating-point comparison against zero, controlled 5090 * by a predicate. 
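 * The zero operand is passed as the literal 0, which is the encoding
 * of +0.0 for float16, float32 and float64 alike, so the DO_FCM*
 * macros above are reused unchanged; e.g. DO_FCMLT(float32, nn, 0,
 * status) expands to float32_compare(nn, 0, status) < 0.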
5091 */ 5092 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ 5093 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 5094 float_status *status, uint32_t desc) \ 5095 { \ 5096 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 5097 uint64_t *d = vd, *g = vg; \ 5098 do { \ 5099 uint64_t out = 0, pg = g[j]; \ 5100 do { \ 5101 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 5102 if ((pg >> (i & 63)) & 1) { \ 5103 TYPE nn = *(TYPE *)(vn + H(i)); \ 5104 out |= OP(TYPE, nn, 0, status); \ 5105 } \ 5106 } while (i & 63); \ 5107 d[j--] = out; \ 5108 } while (i > 0); \ 5109 } 5110 5111 #define DO_FPCMP_PPZ0_H(NAME, OP) \ 5112 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) 5113 #define DO_FPCMP_PPZ0_S(NAME, OP) \ 5114 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) 5115 #define DO_FPCMP_PPZ0_D(NAME, OP) \ 5116 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) 5117 5118 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \ 5119 DO_FPCMP_PPZ0_H(NAME, OP) \ 5120 DO_FPCMP_PPZ0_S(NAME, OP) \ 5121 DO_FPCMP_PPZ0_D(NAME, OP) 5122 5123 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) 5124 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) 5125 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) 5126 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) 5127 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) 5128 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) 5129 5130 /* FP Trig Multiply-Add. */ 5131 5132 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, 5133 float_status *s, uint32_t desc) 5134 { 5135 static const float16 coeff[16] = { 5136 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 5137 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 5138 }; 5139 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); 5140 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5141 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5142 float16 *d = vd, *n = vn, *m = vm; 5143 5144 for (i = 0; i < opr_sz; i++) { 5145 float16 mm = m[i]; 5146 intptr_t xx = x; 5147 int flags = 0; 5148 5149 if (float16_is_neg(mm)) { 5150 if (fpcr_ah) { 5151 flags = float_muladd_negate_product; 5152 } else { 5153 mm = float16_abs(mm); 5154 } 5155 xx += 8; 5156 } 5157 d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s); 5158 } 5159 } 5160 5161 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, 5162 float_status *s, uint32_t desc) 5163 { 5164 static const float32 coeff[16] = { 5165 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, 5166 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, 5167 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, 5168 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, 5169 }; 5170 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); 5171 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5172 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5173 float32 *d = vd, *n = vn, *m = vm; 5174 5175 for (i = 0; i < opr_sz; i++) { 5176 float32 mm = m[i]; 5177 intptr_t xx = x; 5178 int flags = 0; 5179 5180 if (float32_is_neg(mm)) { 5181 if (fpcr_ah) { 5182 flags = float_muladd_negate_product; 5183 } else { 5184 mm = float32_abs(mm); 5185 } 5186 xx += 8; 5187 } 5188 d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s); 5189 } 5190 } 5191 5192 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, 5193 float_status *s, uint32_t desc) 5194 { 5195 static const float64 coeff[16] = { 5196 0x3ff0000000000000ull, 0xbfc5555555555543ull, 5197 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, 5198 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, 5199 0x3de5d8408868552full, 0x0000000000000000ull, 5200 0x3ff0000000000000ull, 0xbfe0000000000000ull, 5201 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, 5202 
0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, 5203 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, 5204 }; 5205 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); 5206 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5207 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5208 float64 *d = vd, *n = vn, *m = vm; 5209 5210 for (i = 0; i < opr_sz; i++) { 5211 float64 mm = m[i]; 5212 intptr_t xx = x; 5213 int flags = 0; 5214 5215 if (float64_is_neg(mm)) { 5216 if (fpcr_ah) { 5217 flags = float_muladd_negate_product; 5218 } else { 5219 mm = float64_abs(mm); 5220 } 5221 xx += 8; 5222 } 5223 d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s); 5224 } 5225 } 5226 5227 /* 5228 * FP Complex Add 5229 */ 5230 5231 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, 5232 float_status *s, uint32_t desc) 5233 { 5234 intptr_t j, i = simd_oprsz(desc); 5235 uint64_t *g = vg; 5236 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5237 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5238 5239 do { 5240 uint64_t pg = g[(i - 1) >> 6]; 5241 do { 5242 float16 e0, e1, e2, e3; 5243 5244 /* I holds the real index; J holds the imag index. */ 5245 j = i - sizeof(float16); 5246 i -= 2 * sizeof(float16); 5247 5248 e0 = *(float16 *)(vn + H1_2(i)); 5249 e1 = *(float16 *)(vm + H1_2(j)); 5250 e2 = *(float16 *)(vn + H1_2(j)); 5251 e3 = *(float16 *)(vm + H1_2(i)); 5252 5253 if (rot) { 5254 e3 = float16_maybe_ah_chs(e3, fpcr_ah); 5255 } else { 5256 e1 = float16_maybe_ah_chs(e1, fpcr_ah); 5257 } 5258 5259 if (likely((pg >> (i & 63)) & 1)) { 5260 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s); 5261 } 5262 if (likely((pg >> (j & 63)) & 1)) { 5263 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s); 5264 } 5265 } while (i & 63); 5266 } while (i != 0); 5267 } 5268 5269 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, 5270 float_status *s, uint32_t desc) 5271 { 5272 intptr_t j, i = simd_oprsz(desc); 5273 uint64_t *g = vg; 5274 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5275 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5276 5277 do { 5278 uint64_t pg = g[(i - 1) >> 6]; 5279 do { 5280 float32 e0, e1, e2, e3; 5281 5282 /* I holds the real index; J holds the imag index. */ 5283 j = i - sizeof(float32); 5284 i -= 2 * sizeof(float32); 5285 5286 e0 = *(float32 *)(vn + H1_2(i)); 5287 e1 = *(float32 *)(vm + H1_2(j)); 5288 e2 = *(float32 *)(vn + H1_2(j)); 5289 e3 = *(float32 *)(vm + H1_2(i)); 5290 5291 if (rot) { 5292 e3 = float32_maybe_ah_chs(e3, fpcr_ah); 5293 } else { 5294 e1 = float32_maybe_ah_chs(e1, fpcr_ah); 5295 } 5296 5297 if (likely((pg >> (i & 63)) & 1)) { 5298 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s); 5299 } 5300 if (likely((pg >> (j & 63)) & 1)) { 5301 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s); 5302 } 5303 } while (i & 63); 5304 } while (i != 0); 5305 } 5306 5307 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, 5308 float_status *s, uint32_t desc) 5309 { 5310 intptr_t j, i = simd_oprsz(desc); 5311 uint64_t *g = vg; 5312 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5313 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5314 5315 do { 5316 uint64_t pg = g[(i - 1) >> 6]; 5317 do { 5318 float64 e0, e1, e2, e3; 5319 5320 /* I holds the real index; J holds the imag index. 
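 *
 * For reference, the sums formed below are the FCADD rotations: with
 * rot clear the result is (n.re - m.im, n.im + m.re), i.e. n plus m
 * rotated by 90 degrees; with rot set it is (n.re + m.im, n.im - m.re),
 * a 270 degree rotation.  E.g. n = 1+2i, m = 3+4i gives -3+5i in the
 * first case and 5-1i in the second.  The negation itself goes through
 * float64_maybe_ah_chs, which applies the FPCR.AH treatment of NaN inputs.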
*/ 5321 j = i - sizeof(float64); 5322 i -= 2 * sizeof(float64); 5323 5324 e0 = *(float64 *)(vn + H1_2(i)); 5325 e1 = *(float64 *)(vm + H1_2(j)); 5326 e2 = *(float64 *)(vn + H1_2(j)); 5327 e3 = *(float64 *)(vm + H1_2(i)); 5328 5329 if (rot) { 5330 e3 = float64_maybe_ah_chs(e3, fpcr_ah); 5331 } else { 5332 e1 = float64_maybe_ah_chs(e1, fpcr_ah); 5333 } 5334 5335 if (likely((pg >> (i & 63)) & 1)) { 5336 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s); 5337 } 5338 if (likely((pg >> (j & 63)) & 1)) { 5339 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s); 5340 } 5341 } while (i & 63); 5342 } while (i != 0); 5343 } 5344 5345 /* 5346 * FP Complex Multiply 5347 */ 5348 5349 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5350 void *vg, float_status *status, uint32_t desc) 5351 { 5352 intptr_t j, i = simd_oprsz(desc); 5353 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5354 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5355 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5356 uint32_t negf_real = flip ^ negf_imag; 5357 float16 negx_imag, negx_real; 5358 uint64_t *g = vg; 5359 5360 /* With AH=0, use negx; with AH=1 use negf. */ 5361 negx_real = (negf_real & ~fpcr_ah) << 15; 5362 negx_imag = (negf_imag & ~fpcr_ah) << 15; 5363 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5364 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5365 5366 do { 5367 uint64_t pg = g[(i - 1) >> 6]; 5368 do { 5369 float16 e1, e2, e3, e4, nr, ni, mr, mi, d; 5370 5371 /* I holds the real index; J holds the imag index. */ 5372 j = i - sizeof(float16); 5373 i -= 2 * sizeof(float16); 5374 5375 nr = *(float16 *)(vn + H1_2(i)); 5376 ni = *(float16 *)(vn + H1_2(j)); 5377 mr = *(float16 *)(vm + H1_2(i)); 5378 mi = *(float16 *)(vm + H1_2(j)); 5379 5380 e2 = (flip ? ni : nr); 5381 e1 = (flip ? mi : mr) ^ negx_real; 5382 e4 = e2; 5383 e3 = (flip ? mr : mi) ^ negx_imag; 5384 5385 if (likely((pg >> (i & 63)) & 1)) { 5386 d = *(float16 *)(va + H1_2(i)); 5387 d = float16_muladd(e2, e1, d, negf_real, status); 5388 *(float16 *)(vd + H1_2(i)) = d; 5389 } 5390 if (likely((pg >> (j & 63)) & 1)) { 5391 d = *(float16 *)(va + H1_2(j)); 5392 d = float16_muladd(e4, e3, d, negf_imag, status); 5393 *(float16 *)(vd + H1_2(j)) = d; 5394 } 5395 } while (i & 63); 5396 } while (i != 0); 5397 } 5398 5399 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5400 void *vg, float_status *status, uint32_t desc) 5401 { 5402 intptr_t j, i = simd_oprsz(desc); 5403 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5404 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5405 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5406 uint32_t negf_real = flip ^ negf_imag; 5407 float32 negx_imag, negx_real; 5408 uint64_t *g = vg; 5409 5410 /* With AH=0, use negx; with AH=1 use negf. */ 5411 negx_real = (negf_real & ~fpcr_ah) << 31; 5412 negx_imag = (negf_imag & ~fpcr_ah) << 31; 5413 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5414 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5415 5416 do { 5417 uint64_t pg = g[(i - 1) >> 6]; 5418 do { 5419 float32 e1, e2, e3, e4, nr, ni, mr, mi, d; 5420 5421 /* I holds the real index; J holds the imag index. */ 5422 j = i - sizeof(float32); 5423 i -= 2 * sizeof(float32); 5424 5425 nr = *(float32 *)(vn + H1_2(i)); 5426 ni = *(float32 *)(vn + H1_2(j)); 5427 mr = *(float32 *)(vm + H1_2(i)); 5428 mi = *(float32 *)(vm + H1_2(j)); 5429 5430 e2 = (flip ? 
ni : nr); 5431 e1 = (flip ? mi : mr) ^ negx_real; 5432 e4 = e2; 5433 e3 = (flip ? mr : mi) ^ negx_imag; 5434 5435 if (likely((pg >> (i & 63)) & 1)) { 5436 d = *(float32 *)(va + H1_2(i)); 5437 d = float32_muladd(e2, e1, d, negf_real, status); 5438 *(float32 *)(vd + H1_2(i)) = d; 5439 } 5440 if (likely((pg >> (j & 63)) & 1)) { 5441 d = *(float32 *)(va + H1_2(j)); 5442 d = float32_muladd(e4, e3, d, negf_imag, status); 5443 *(float32 *)(vd + H1_2(j)) = d; 5444 } 5445 } while (i & 63); 5446 } while (i != 0); 5447 } 5448 5449 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5450 void *vg, float_status *status, uint32_t desc) 5451 { 5452 intptr_t j, i = simd_oprsz(desc); 5453 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5454 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5455 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5456 uint32_t negf_real = flip ^ negf_imag; 5457 float64 negx_imag, negx_real; 5458 uint64_t *g = vg; 5459 5460 /* With AH=0, use negx; with AH=1 use negf. */ 5461 negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63; 5462 negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63; 5463 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5464 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5465 5466 do { 5467 uint64_t pg = g[(i - 1) >> 6]; 5468 do { 5469 float64 e1, e2, e3, e4, nr, ni, mr, mi, d; 5470 5471 /* I holds the real index; J holds the imag index. */ 5472 j = i - sizeof(float64); 5473 i -= 2 * sizeof(float64); 5474 5475 nr = *(float64 *)(vn + H1_2(i)); 5476 ni = *(float64 *)(vn + H1_2(j)); 5477 mr = *(float64 *)(vm + H1_2(i)); 5478 mi = *(float64 *)(vm + H1_2(j)); 5479 5480 e2 = (flip ? ni : nr); 5481 e1 = (flip ? mi : mr) ^ negx_real; 5482 e4 = e2; 5483 e3 = (flip ? mr : mi) ^ negx_imag; 5484 5485 if (likely((pg >> (i & 63)) & 1)) { 5486 d = *(float64 *)(va + H1_2(i)); 5487 d = float64_muladd(e2, e1, d, negf_real, status); 5488 *(float64 *)(vd + H1_2(i)) = d; 5489 } 5490 if (likely((pg >> (j & 63)) & 1)) { 5491 d = *(float64 *)(va + H1_2(j)); 5492 d = float64_muladd(e4, e3, d, negf_imag, status); 5493 *(float64 *)(vd + H1_2(j)) = d; 5494 } 5495 } while (i & 63); 5496 } while (i != 0); 5497 } 5498 5499 /* 5500 * Load contiguous data, protected by a governing predicate. 5501 */ 5502 5503 /* 5504 * Skip through a sequence of inactive elements in the guarding predicate @vg, 5505 * beginning at @reg_off bounded by @reg_max. Return the offset of the active 5506 * element >= @reg_off, or @reg_max if there were no active elements at all. 5507 */ 5508 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, 5509 intptr_t reg_max, int esz) 5510 { 5511 uint64_t pg_mask = pred_esz_masks[esz]; 5512 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); 5513 5514 /* In normal usage, the first element is active. */ 5515 if (likely(pg & 1)) { 5516 return reg_off; 5517 } 5518 5519 if (pg == 0) { 5520 reg_off &= -64; 5521 do { 5522 reg_off += 64; 5523 if (unlikely(reg_off >= reg_max)) { 5524 /* The entire predicate was false. */ 5525 return reg_max; 5526 } 5527 pg = vg[reg_off >> 6] & pg_mask; 5528 } while (pg == 0); 5529 } 5530 reg_off += ctz64(pg); 5531 5532 /* We should never see an out of range predicate bit set. */ 5533 tcg_debug_assert(reg_off < reg_max); 5534 return reg_off; 5535 } 5536 5537 /* 5538 * Resolve the guest virtual address to info->host and info->flags. 5539 * If @nofault, return false if the page is invalid, otherwise 5540 * exit via page fault exception. 
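 *
 * An illustrative caller sketch (arguments here are only for exposition;
 * real callers pass the element's mem_off and their own retaddr):
 *
 *     SVEHostPage info;
 *     if (sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
 *                        arm_env_mmu_index(env), retaddr)) {
 *         // Valid page: info.host is the host address for @addr when
 *         // the page is backed by RAM, info.flags carries TLB_MMIO,
 *         // TLB_WATCHPOINT etc., and info.tagged says if MTE applies.
 *     } else {
 *         // Reachable only with @nofault true: the page is invalid.
 *     }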
5541 */ 5542 5543 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, 5544 target_ulong addr, int mem_off, MMUAccessType access_type, 5545 int mmu_idx, uintptr_t retaddr) 5546 { 5547 int flags; 5548 5549 addr += mem_off; 5550 5551 /* 5552 * User-only currently always issues with TBI. See the comment 5553 * above useronly_clean_ptr. Usually we clean this top byte away 5554 * during translation, but we can't do that for e.g. vector + imm 5555 * addressing modes. 5556 * 5557 * We currently always enable TBI for user-only, and do not provide 5558 * a way to turn it off. So clean the pointer unconditionally here, 5559 * rather than look it up here, or pass it down from above. 5560 */ 5561 addr = useronly_clean_ptr(addr); 5562 5563 #ifdef CONFIG_USER_ONLY 5564 flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault, 5565 &info->host, retaddr); 5566 #else 5567 CPUTLBEntryFull *full; 5568 flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault, 5569 &info->host, &full, retaddr); 5570 #endif 5571 info->flags = flags; 5572 5573 if (flags & TLB_INVALID_MASK) { 5574 g_assert(nofault); 5575 return false; 5576 } 5577 5578 #ifdef CONFIG_USER_ONLY 5579 memset(&info->attrs, 0, sizeof(info->attrs)); 5580 /* Require both ANON and MTE; see allocation_tag_mem(). */ 5581 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); 5582 #else 5583 info->attrs = full->attrs; 5584 info->tagged = full->extra.arm.pte_attrs == 0xf0; 5585 #endif 5586 5587 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ 5588 info->host -= mem_off; 5589 return true; 5590 } 5591 5592 /* 5593 * Find first active element on each page, and a loose bound for the 5594 * final element on each page. Identify any single element that spans 5595 * the page boundary. Return true if there are any active elements. 5596 */ 5597 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, 5598 intptr_t reg_max, int esz, int msize) 5599 { 5600 const int esize = 1 << esz; 5601 const uint64_t pg_mask = pred_esz_masks[esz]; 5602 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 5603 intptr_t mem_off_last, mem_off_split; 5604 intptr_t page_split, elt_split; 5605 intptr_t i; 5606 5607 /* Set all of the element indices to -1, and the TLB data to 0. */ 5608 memset(info, -1, offsetof(SVEContLdSt, page)); 5609 memset(info->page, 0, sizeof(info->page)); 5610 5611 /* Gross scan over the entire predicate to find bounds. */ 5612 i = 0; 5613 do { 5614 uint64_t pg = vg[i] & pg_mask; 5615 if (pg) { 5616 reg_off_last = i * 64 + 63 - clz64(pg); 5617 if (reg_off_first < 0) { 5618 reg_off_first = i * 64 + ctz64(pg); 5619 } 5620 } 5621 } while (++i * 64 < reg_max); 5622 5623 if (unlikely(reg_off_first < 0)) { 5624 /* No active elements, no pages touched. */ 5625 return false; 5626 } 5627 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); 5628 5629 info->reg_off_first[0] = reg_off_first; 5630 info->mem_off_first[0] = (reg_off_first >> esz) * msize; 5631 mem_off_last = (reg_off_last >> esz) * msize; 5632 5633 page_split = -(addr | TARGET_PAGE_MASK); 5634 if (likely(mem_off_last + msize <= page_split)) { 5635 /* The entire operation fits within a single page. 
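 * For example, a 32-byte .S access (esz == 2, msize == 4) with only
 * predicate bits 4 and 24 set has reg_off_first[0] == 4,
 * reg_off_last[0] == 24 and mem_off_last == 24, so it stays on one page
 * whenever at least 28 bytes remain before the page boundary; the [1]
 * entries and the split/page_split fields keep their -1 initialisation.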
*/ 5636 info->reg_off_last[0] = reg_off_last; 5637 return true; 5638 } 5639 5640 info->page_split = page_split; 5641 elt_split = page_split / msize; 5642 reg_off_split = elt_split << esz; 5643 mem_off_split = elt_split * msize; 5644 5645 /* 5646 * This is the last full element on the first page, but it is not 5647 * necessarily active. If there is no full element, i.e. the first 5648 * active element is the one that's split, this value remains -1. 5649 * It is useful as iteration bounds. 5650 */ 5651 if (elt_split != 0) { 5652 info->reg_off_last[0] = reg_off_split - esize; 5653 } 5654 5655 /* Determine if an unaligned element spans the pages. */ 5656 if (page_split % msize != 0) { 5657 /* It is helpful to know if the split element is active. */ 5658 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { 5659 info->reg_off_split = reg_off_split; 5660 info->mem_off_split = mem_off_split; 5661 5662 if (reg_off_split == reg_off_last) { 5663 /* The page crossing element is last. */ 5664 return true; 5665 } 5666 } 5667 reg_off_split += esize; 5668 mem_off_split += msize; 5669 } 5670 5671 /* 5672 * We do want the first active element on the second page, because 5673 * this may affect the address reported in an exception. 5674 */ 5675 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); 5676 tcg_debug_assert(reg_off_split <= reg_off_last); 5677 info->reg_off_first[1] = reg_off_split; 5678 info->mem_off_first[1] = (reg_off_split >> esz) * msize; 5679 info->reg_off_last[1] = reg_off_last; 5680 return true; 5681 } 5682 5683 /* 5684 * Resolve the guest virtual addresses to info->page[]. 5685 * Control the generation of page faults with @fault. Return false if 5686 * there is no work to do, which can only happen with @fault == FAULT_NO. 5687 */ 5688 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, 5689 CPUARMState *env, target_ulong addr, 5690 MMUAccessType access_type, uintptr_t retaddr) 5691 { 5692 int mmu_idx = arm_env_mmu_index(env); 5693 int mem_off = info->mem_off_first[0]; 5694 bool nofault = fault == FAULT_NO; 5695 bool have_work = true; 5696 5697 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, 5698 access_type, mmu_idx, retaddr)) { 5699 /* No work to be done. */ 5700 return false; 5701 } 5702 5703 if (likely(info->page_split < 0)) { 5704 /* The entire operation was on the one page. */ 5705 return true; 5706 } 5707 5708 /* 5709 * If the second page is invalid, then we want the fault address to be 5710 * the first byte on that page which is accessed. 5711 */ 5712 if (info->mem_off_split >= 0) { 5713 /* 5714 * There is an element split across the pages. The fault address 5715 * should be the first byte of the second page. 5716 */ 5717 mem_off = info->page_split; 5718 /* 5719 * If the split element is also the first active element 5720 * of the vector, then: For first-fault we should continue 5721 * to generate faults for the second page. For no-fault, 5722 * we have work only if the second page is valid. 5723 */ 5724 if (info->mem_off_first[0] < info->mem_off_split) { 5725 nofault = FAULT_FIRST; 5726 have_work = false; 5727 } 5728 } else { 5729 /* 5730 * There is no element split across the pages. The fault address 5731 * should be the first active element on the second page. 5732 */ 5733 mem_off = info->mem_off_first[1]; 5734 /* 5735 * There must have been one active element on the first page, 5736 * so we're out of first-fault territory. 
5737 */ 5738 nofault = fault != FAULT_ALL; 5739 } 5740 5741 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off, 5742 access_type, mmu_idx, retaddr); 5743 return have_work; 5744 } 5745 5746 #ifndef CONFIG_USER_ONLY 5747 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, 5748 uint64_t *vg, target_ulong addr, 5749 int esize, int msize, int wp_access, 5750 uintptr_t retaddr) 5751 { 5752 intptr_t mem_off, reg_off, reg_last; 5753 int flags0 = info->page[0].flags; 5754 int flags1 = info->page[1].flags; 5755 5756 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { 5757 return; 5758 } 5759 5760 /* Indicate that watchpoints are handled. */ 5761 info->page[0].flags = flags0 & ~TLB_WATCHPOINT; 5762 info->page[1].flags = flags1 & ~TLB_WATCHPOINT; 5763 5764 if (flags0 & TLB_WATCHPOINT) { 5765 mem_off = info->mem_off_first[0]; 5766 reg_off = info->reg_off_first[0]; 5767 reg_last = info->reg_off_last[0]; 5768 5769 while (reg_off <= reg_last) { 5770 uint64_t pg = vg[reg_off >> 6]; 5771 do { 5772 if ((pg >> (reg_off & 63)) & 1) { 5773 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 5774 msize, info->page[0].attrs, 5775 wp_access, retaddr); 5776 } 5777 reg_off += esize; 5778 mem_off += msize; 5779 } while (reg_off <= reg_last && (reg_off & 63)); 5780 } 5781 } 5782 5783 mem_off = info->mem_off_split; 5784 if (mem_off >= 0) { 5785 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize, 5786 info->page[0].attrs, wp_access, retaddr); 5787 } 5788 5789 mem_off = info->mem_off_first[1]; 5790 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) { 5791 reg_off = info->reg_off_first[1]; 5792 reg_last = info->reg_off_last[1]; 5793 5794 do { 5795 uint64_t pg = vg[reg_off >> 6]; 5796 do { 5797 if ((pg >> (reg_off & 63)) & 1) { 5798 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 5799 msize, info->page[1].attrs, 5800 wp_access, retaddr); 5801 } 5802 reg_off += esize; 5803 mem_off += msize; 5804 } while (reg_off & 63); 5805 } while (reg_off <= reg_last); 5806 } 5807 } 5808 #endif 5809 5810 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, 5811 uint64_t *vg, target_ulong addr, int esize, 5812 int msize, uint32_t mtedesc, uintptr_t ra) 5813 { 5814 intptr_t mem_off, reg_off, reg_last; 5815 5816 /* Process the page only if MemAttr == Tagged. */ 5817 if (info->page[0].tagged) { 5818 mem_off = info->mem_off_first[0]; 5819 reg_off = info->reg_off_first[0]; 5820 reg_last = info->reg_off_split; 5821 if (reg_last < 0) { 5822 reg_last = info->reg_off_last[0]; 5823 } 5824 5825 do { 5826 uint64_t pg = vg[reg_off >> 6]; 5827 do { 5828 if ((pg >> (reg_off & 63)) & 1) { 5829 mte_check(env, mtedesc, addr, ra); 5830 } 5831 reg_off += esize; 5832 mem_off += msize; 5833 } while (reg_off <= reg_last && (reg_off & 63)); 5834 } while (reg_off <= reg_last); 5835 } 5836 5837 mem_off = info->mem_off_first[1]; 5838 if (mem_off >= 0 && info->page[1].tagged) { 5839 reg_off = info->reg_off_first[1]; 5840 reg_last = info->reg_off_last[1]; 5841 5842 do { 5843 uint64_t pg = vg[reg_off >> 6]; 5844 do { 5845 if ((pg >> (reg_off & 63)) & 1) { 5846 mte_check(env, mtedesc, addr, ra); 5847 } 5848 reg_off += esize; 5849 mem_off += msize; 5850 } while (reg_off & 63); 5851 } while (reg_off <= reg_last); 5852 } 5853 } 5854 5855 /* 5856 * Common helper for all contiguous 1,2,3,4-register predicated stores. 
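 * sve_ldN_r below handles the loads: inactive elements of the destination
 * registers are zeroed and active elements are filled either directly from
 * host memory (host_fn) or via the slow path (tlb_fn) for MMIO and for an
 * element split across a page boundary (the matching store helper is
 * sve_stN_r further down).  A sketch of how the generated entry points
 * feed it, e.g. LD3H (little-endian) via DO_LDN_2(3, hh, MO_16):
 *
 *     void HELPER(sve_ld3hh_le_r)(CPUARMState *env, void *vg,
 *                                 target_ulong addr, uint32_t desc)
 *     {
 *         sve_ldN_r(env, vg, addr, desc, GETPC(), MO_16, MO_16, 3, 0,
 *                   sve_ld1hh_le_host, sve_ld1hh_le_tlb);
 *     }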
5857 */ 5858 static inline QEMU_ALWAYS_INLINE 5859 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, 5860 uint32_t desc, const uintptr_t retaddr, 5861 const int esz, const int msz, const int N, uint32_t mtedesc, 5862 sve_ldst1_host_fn *host_fn, 5863 sve_ldst1_tlb_fn *tlb_fn) 5864 { 5865 const unsigned rd = simd_data(desc); 5866 const intptr_t reg_max = simd_oprsz(desc); 5867 intptr_t reg_off, reg_last, mem_off; 5868 SVEContLdSt info; 5869 void *host; 5870 int flags, i; 5871 5872 /* Find the active elements. */ 5873 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 5874 /* The entire predicate was false; no load occurs. */ 5875 for (i = 0; i < N; ++i) { 5876 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5877 } 5878 return; 5879 } 5880 5881 /* Probe the page(s). Exit with exception for any invalid page. */ 5882 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); 5883 5884 /* Handle watchpoints for all active elements. */ 5885 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 5886 BP_MEM_READ, retaddr); 5887 5888 /* 5889 * Handle mte checks for all active elements. 5890 * Since TBI must be set for MTE, !mtedesc => !mte_active. 5891 */ 5892 if (mtedesc) { 5893 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 5894 mtedesc, retaddr); 5895 } 5896 5897 flags = info.page[0].flags | info.page[1].flags; 5898 if (unlikely(flags != 0)) { 5899 /* 5900 * At least one page includes MMIO. 5901 * Any bus operation can fail with cpu_transaction_failed, 5902 * which for ARM will raise SyncExternal. Perform the load 5903 * into scratch memory to preserve register state until the end. 5904 */ 5905 ARMVectorReg scratch[4] = { }; 5906 5907 mem_off = info.mem_off_first[0]; 5908 reg_off = info.reg_off_first[0]; 5909 reg_last = info.reg_off_last[1]; 5910 if (reg_last < 0) { 5911 reg_last = info.reg_off_split; 5912 if (reg_last < 0) { 5913 reg_last = info.reg_off_last[0]; 5914 } 5915 } 5916 5917 do { 5918 uint64_t pg = vg[reg_off >> 6]; 5919 do { 5920 if ((pg >> (reg_off & 63)) & 1) { 5921 for (i = 0; i < N; ++i) { 5922 tlb_fn(env, &scratch[i], reg_off, 5923 addr + mem_off + (i << msz), retaddr); 5924 } 5925 } 5926 reg_off += 1 << esz; 5927 mem_off += N << msz; 5928 } while (reg_off & 63); 5929 } while (reg_off <= reg_last); 5930 5931 for (i = 0; i < N; ++i) { 5932 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); 5933 } 5934 return; 5935 } 5936 5937 /* The entire operation is in RAM, on valid pages. */ 5938 5939 for (i = 0; i < N; ++i) { 5940 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5941 } 5942 5943 mem_off = info.mem_off_first[0]; 5944 reg_off = info.reg_off_first[0]; 5945 reg_last = info.reg_off_last[0]; 5946 host = info.page[0].host; 5947 5948 set_helper_retaddr(retaddr); 5949 5950 while (reg_off <= reg_last) { 5951 uint64_t pg = vg[reg_off >> 6]; 5952 do { 5953 if ((pg >> (reg_off & 63)) & 1) { 5954 for (i = 0; i < N; ++i) { 5955 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5956 host + mem_off + (i << msz)); 5957 } 5958 } 5959 reg_off += 1 << esz; 5960 mem_off += N << msz; 5961 } while (reg_off <= reg_last && (reg_off & 63)); 5962 } 5963 5964 clear_helper_retaddr(); 5965 5966 /* 5967 * Use the slow path to manage the cross-page misalignment. 5968 * But we know this is RAM and cannot trap. 
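 * Only the single element recorded in reg_off_split/mem_off_split is
 * redone below; tlb_fn issues one guest access that spans the boundary,
 * and both pages were already validated with FAULT_ALL above.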
5969 */ 5970 mem_off = info.mem_off_split; 5971 if (unlikely(mem_off >= 0)) { 5972 reg_off = info.reg_off_split; 5973 for (i = 0; i < N; ++i) { 5974 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 5975 addr + mem_off + (i << msz), retaddr); 5976 } 5977 } 5978 5979 mem_off = info.mem_off_first[1]; 5980 if (unlikely(mem_off >= 0)) { 5981 reg_off = info.reg_off_first[1]; 5982 reg_last = info.reg_off_last[1]; 5983 host = info.page[1].host; 5984 5985 set_helper_retaddr(retaddr); 5986 5987 do { 5988 uint64_t pg = vg[reg_off >> 6]; 5989 do { 5990 if ((pg >> (reg_off & 63)) & 1) { 5991 for (i = 0; i < N; ++i) { 5992 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5993 host + mem_off + (i << msz)); 5994 } 5995 } 5996 reg_off += 1 << esz; 5997 mem_off += N << msz; 5998 } while (reg_off & 63); 5999 } while (reg_off <= reg_last); 6000 6001 clear_helper_retaddr(); 6002 } 6003 } 6004 6005 static inline QEMU_ALWAYS_INLINE 6006 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6007 uint32_t desc, const uintptr_t ra, 6008 const int esz, const int msz, const int N, 6009 sve_ldst1_host_fn *host_fn, 6010 sve_ldst1_tlb_fn *tlb_fn) 6011 { 6012 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6013 int bit55 = extract64(addr, 55, 1); 6014 6015 /* Remove mtedesc from the normal sve descriptor. */ 6016 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6017 6018 /* Perform gross MTE suppression early. */ 6019 if (!tbi_check(mtedesc, bit55) || 6020 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6021 mtedesc = 0; 6022 } 6023 6024 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6025 } 6026 6027 #define DO_LD1_1(NAME, ESZ) \ 6028 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ 6029 target_ulong addr, uint32_t desc) \ 6030 { \ 6031 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ 6032 sve_##NAME##_host, sve_##NAME##_tlb); \ 6033 } \ 6034 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6035 target_ulong addr, uint32_t desc) \ 6036 { \ 6037 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ 6038 sve_##NAME##_host, sve_##NAME##_tlb); \ 6039 } 6040 6041 #define DO_LD1_2(NAME, ESZ, MSZ) \ 6042 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ 6043 target_ulong addr, uint32_t desc) \ 6044 { \ 6045 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 6046 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 6047 } \ 6048 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ 6049 target_ulong addr, uint32_t desc) \ 6050 { \ 6051 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 6052 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 6053 } \ 6054 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6055 target_ulong addr, uint32_t desc) \ 6056 { \ 6057 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 6058 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 6059 } \ 6060 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6061 target_ulong addr, uint32_t desc) \ 6062 { \ 6063 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 6064 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 6065 } 6066 6067 DO_LD1_1(ld1bb, MO_8) 6068 DO_LD1_1(ld1bhu, MO_16) 6069 DO_LD1_1(ld1bhs, MO_16) 6070 DO_LD1_1(ld1bsu, MO_32) 6071 DO_LD1_1(ld1bss, MO_32) 6072 DO_LD1_1(ld1bdu, MO_64) 6073 DO_LD1_1(ld1bds, MO_64) 6074 6075 DO_LD1_2(ld1hh, MO_16, MO_16) 6076 DO_LD1_2(ld1hsu, MO_32, MO_16) 6077 DO_LD1_2(ld1hss, MO_32, MO_16) 6078 DO_LD1_2(ld1hdu, MO_64, MO_16) 6079 
DO_LD1_2(ld1hds, MO_64, MO_16) 6080 6081 DO_LD1_2(ld1ss, MO_32, MO_32) 6082 DO_LD1_2(ld1sdu, MO_64, MO_32) 6083 DO_LD1_2(ld1sds, MO_64, MO_32) 6084 6085 DO_LD1_2(ld1dd, MO_64, MO_64) 6086 6087 #undef DO_LD1_1 6088 #undef DO_LD1_2 6089 6090 #define DO_LDN_1(N) \ 6091 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ 6092 target_ulong addr, uint32_t desc) \ 6093 { \ 6094 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ 6095 sve_ld1bb_host, sve_ld1bb_tlb); \ 6096 } \ 6097 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ 6098 target_ulong addr, uint32_t desc) \ 6099 { \ 6100 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ 6101 sve_ld1bb_host, sve_ld1bb_tlb); \ 6102 } 6103 6104 #define DO_LDN_2(N, SUFF, ESZ) \ 6105 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ 6106 target_ulong addr, uint32_t desc) \ 6107 { \ 6108 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 6109 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 6110 } \ 6111 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ 6112 target_ulong addr, uint32_t desc) \ 6113 { \ 6114 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 6115 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 6116 } \ 6117 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ 6118 target_ulong addr, uint32_t desc) \ 6119 { \ 6120 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 6121 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 6122 } \ 6123 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ 6124 target_ulong addr, uint32_t desc) \ 6125 { \ 6126 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 6127 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 6128 } 6129 6130 DO_LDN_1(2) 6131 DO_LDN_1(3) 6132 DO_LDN_1(4) 6133 6134 DO_LDN_2(2, hh, MO_16) 6135 DO_LDN_2(3, hh, MO_16) 6136 DO_LDN_2(4, hh, MO_16) 6137 6138 DO_LDN_2(2, ss, MO_32) 6139 DO_LDN_2(3, ss, MO_32) 6140 DO_LDN_2(4, ss, MO_32) 6141 6142 DO_LDN_2(2, dd, MO_64) 6143 DO_LDN_2(3, dd, MO_64) 6144 DO_LDN_2(4, dd, MO_64) 6145 6146 #undef DO_LDN_1 6147 #undef DO_LDN_2 6148 6149 /* 6150 * Load contiguous data, first-fault and no-fault. 6151 * 6152 * For user-only, we control the race between page_check_range and 6153 * another thread's munmap by using set/clear_helper_retaddr. Any 6154 * SEGV that occurs between those markers is assumed to be because 6155 * the guest page vanished. Keep that block as small as possible 6156 * so that unrelated QEMU bugs are not blamed on the guest. 6157 */ 6158 6159 /* Fault on byte I. All bits in FFR from I are cleared. The vector 6160 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE 6161 * option, which leaves subsequent data unchanged. 6162 */ 6163 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) 6164 { 6165 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; 6166 6167 if (i & 63) { 6168 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); 6169 i = ROUND_UP(i, 64); 6170 } 6171 for (; i < oprsz; i += 64) { 6172 ffr[i / 64] = 0; 6173 } 6174 } 6175 6176 /* 6177 * Common helper for all contiguous no-fault and first-fault loads. 
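 *
 * @fault selects the behaviour: FAULT_FIRST lets the first active element
 * take a real exception and then stops at the first later problem, while
 * FAULT_NO never takes an exception at all; in both cases the elements
 * that were not loaded are recorded in FFR via record_fault().  A sketch
 * of a generated entry point, e.g. LDFF1W (little-endian) from
 * DO_LDFF1_LDNF1_2(ss, MO_32, MO_32):
 *
 *     void HELPER(sve_ldff1ss_le_r)(CPUARMState *env, void *vg,
 *                                   target_ulong addr, uint32_t desc)
 *     {
 *         sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, MO_32, MO_32,
 *                       FAULT_FIRST, sve_ld1ss_le_host, sve_ld1ss_le_tlb);
 *     }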
6178 */ 6179 static inline QEMU_ALWAYS_INLINE 6180 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, 6181 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc, 6182 const int esz, const int msz, const SVEContFault fault, 6183 sve_ldst1_host_fn *host_fn, 6184 sve_ldst1_tlb_fn *tlb_fn) 6185 { 6186 const unsigned rd = simd_data(desc); 6187 void *vd = &env->vfp.zregs[rd]; 6188 const intptr_t reg_max = simd_oprsz(desc); 6189 intptr_t reg_off, mem_off, reg_last; 6190 SVEContLdSt info; 6191 int flags; 6192 void *host; 6193 6194 /* Find the active elements. */ 6195 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) { 6196 /* The entire predicate was false; no load occurs. */ 6197 memset(vd, 0, reg_max); 6198 return; 6199 } 6200 reg_off = info.reg_off_first[0]; 6201 6202 /* Probe the page(s). */ 6203 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) { 6204 /* Fault on first element. */ 6205 tcg_debug_assert(fault == FAULT_NO); 6206 memset(vd, 0, reg_max); 6207 goto do_fault; 6208 } 6209 6210 mem_off = info.mem_off_first[0]; 6211 flags = info.page[0].flags; 6212 6213 /* 6214 * Disable MTE checking if the Tagged bit is not set. Since TBI must 6215 * be set within MTEDESC for MTE, !mtedesc => !mte_active. 6216 */ 6217 if (!info.page[0].tagged) { 6218 mtedesc = 0; 6219 } 6220 6221 if (fault == FAULT_FIRST) { 6222 /* Trapping mte check for the first-fault element. */ 6223 if (mtedesc) { 6224 mte_check(env, mtedesc, addr + mem_off, retaddr); 6225 } 6226 6227 /* 6228 * Special handling of the first active element, 6229 * if it crosses a page boundary or is MMIO. 6230 */ 6231 bool is_split = mem_off == info.mem_off_split; 6232 if (unlikely(flags != 0) || unlikely(is_split)) { 6233 /* 6234 * Use the slow path for cross-page handling. 6235 * Might trap for MMIO or watchpoints. 6236 */ 6237 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6238 6239 /* After any fault, zero the other elements. */ 6240 swap_memzero(vd, reg_off); 6241 reg_off += 1 << esz; 6242 mem_off += 1 << msz; 6243 swap_memzero(vd + reg_off, reg_max - reg_off); 6244 6245 if (is_split) { 6246 goto second_page; 6247 } 6248 } else { 6249 memset(vd, 0, reg_max); 6250 } 6251 } else { 6252 memset(vd, 0, reg_max); 6253 if (unlikely(mem_off == info.mem_off_split)) { 6254 /* The first active element crosses a page boundary. */ 6255 flags |= info.page[1].flags; 6256 if (unlikely(flags & TLB_MMIO)) { 6257 /* Some page is MMIO, see below. */ 6258 goto do_fault; 6259 } 6260 if (unlikely(flags & TLB_WATCHPOINT) && 6261 (cpu_watchpoint_address_matches 6262 (env_cpu(env), addr + mem_off, 1 << msz) 6263 & BP_MEM_READ)) { 6264 /* Watchpoint hit, see below. */ 6265 goto do_fault; 6266 } 6267 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6268 goto do_fault; 6269 } 6270 /* 6271 * Use the slow path for cross-page handling. 6272 * This is RAM, without a watchpoint, and will not trap. 6273 */ 6274 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6275 goto second_page; 6276 } 6277 } 6278 6279 /* 6280 * From this point on, all memory operations are MemSingleNF. 6281 * 6282 * Per the MemSingleNF pseudocode, a no-fault load from Device memory 6283 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead. 6284 * 6285 * Unfortuately we do not have access to the memory attributes from the 6286 * PTE to tell Device memory from Normal memory. So we make a mostly 6287 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO. 
6288 * This gives the right answer for the common cases of "Normal memory, 6289 * backed by host RAM" and "Device memory, backed by MMIO". 6290 * The architecture allows us to suppress an NF load and return 6291 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner 6292 * case of "Normal memory, backed by MMIO" is permitted. The case we 6293 * get wrong is "Device memory, backed by host RAM", for which we 6294 * should return (UNKNOWN, FAULT) for but do not. 6295 * 6296 * Similarly, CPU_BP breakpoints would raise exceptions, and so 6297 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and 6298 * architectural breakpoints the same. 6299 */ 6300 if (unlikely(flags & TLB_MMIO)) { 6301 goto do_fault; 6302 } 6303 6304 reg_last = info.reg_off_last[0]; 6305 host = info.page[0].host; 6306 6307 set_helper_retaddr(retaddr); 6308 6309 do { 6310 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); 6311 do { 6312 if ((pg >> (reg_off & 63)) & 1) { 6313 if (unlikely(flags & TLB_WATCHPOINT) && 6314 (cpu_watchpoint_address_matches 6315 (env_cpu(env), addr + mem_off, 1 << msz) 6316 & BP_MEM_READ)) { 6317 clear_helper_retaddr(); 6318 goto do_fault; 6319 } 6320 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6321 clear_helper_retaddr(); 6322 goto do_fault; 6323 } 6324 host_fn(vd, reg_off, host + mem_off); 6325 } 6326 reg_off += 1 << esz; 6327 mem_off += 1 << msz; 6328 } while (reg_off <= reg_last && (reg_off & 63)); 6329 } while (reg_off <= reg_last); 6330 6331 clear_helper_retaddr(); 6332 6333 /* 6334 * MemSingleNF is allowed to fail for any reason. We have special 6335 * code above to handle the first element crossing a page boundary. 6336 * As an implementation choice, decline to handle a cross-page element 6337 * in any other position. 6338 */ 6339 reg_off = info.reg_off_split; 6340 if (reg_off >= 0) { 6341 goto do_fault; 6342 } 6343 6344 second_page: 6345 reg_off = info.reg_off_first[1]; 6346 if (likely(reg_off < 0)) { 6347 /* No active elements on the second page. All done. */ 6348 return; 6349 } 6350 6351 /* 6352 * MemSingleNF is allowed to fail for any reason. As an implementation 6353 * choice, decline to handle elements on the second page. This should 6354 * be low frequency as the guest walks through memory -- the next 6355 * iteration of the guest's loop should be aligned on the page boundary, 6356 * and then all following iterations will stay aligned. 6357 */ 6358 6359 do_fault: 6360 record_fault(env, reg_off, reg_max); 6361 } 6362 6363 static inline QEMU_ALWAYS_INLINE 6364 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, 6365 uint32_t desc, const uintptr_t retaddr, 6366 const int esz, const int msz, const SVEContFault fault, 6367 sve_ldst1_host_fn *host_fn, 6368 sve_ldst1_tlb_fn *tlb_fn) 6369 { 6370 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6371 int bit55 = extract64(addr, 55, 1); 6372 6373 /* Remove mtedesc from the normal sve descriptor. */ 6374 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6375 6376 /* Perform gross MTE suppression early. 
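 * That is, drop the MTE descriptor when either TBI is disabled for the
 * half of the address space selected by bit 55 (bits [59:56] are then
 * ordinary address bits, not a tag), or the TCMA check passes for the
 * logical tag in bits [59:56]; for instance a typical untagged user
 * pointer with tag 0b0000 and TCMA0 enabled is exempt, so none of the
 * per-element checks below are performed.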
*/ 6377 if (!tbi_check(mtedesc, bit55) || 6378 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6379 mtedesc = 0; 6380 } 6381 6382 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, 6383 esz, msz, fault, host_fn, tlb_fn); 6384 } 6385 6386 #define DO_LDFF1_LDNF1_1(PART, ESZ) \ 6387 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ 6388 target_ulong addr, uint32_t desc) \ 6389 { \ 6390 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ 6391 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6392 } \ 6393 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ 6394 target_ulong addr, uint32_t desc) \ 6395 { \ 6396 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ 6397 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6398 } \ 6399 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6400 target_ulong addr, uint32_t desc) \ 6401 { \ 6402 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ 6403 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6404 } \ 6405 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6406 target_ulong addr, uint32_t desc) \ 6407 { \ 6408 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ 6409 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6410 } 6411 6412 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ 6413 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ 6414 target_ulong addr, uint32_t desc) \ 6415 { \ 6416 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6417 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6418 } \ 6419 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ 6420 target_ulong addr, uint32_t desc) \ 6421 { \ 6422 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6423 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6424 } \ 6425 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ 6426 target_ulong addr, uint32_t desc) \ 6427 { \ 6428 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6429 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6430 } \ 6431 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ 6432 target_ulong addr, uint32_t desc) \ 6433 { \ 6434 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6435 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6436 } \ 6437 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6438 target_ulong addr, uint32_t desc) \ 6439 { \ 6440 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6441 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6442 } \ 6443 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6444 target_ulong addr, uint32_t desc) \ 6445 { \ 6446 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6447 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6448 } \ 6449 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6450 target_ulong addr, uint32_t desc) \ 6451 { \ 6452 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6453 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6454 } \ 6455 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6456 target_ulong addr, uint32_t desc) \ 6457 { \ 6458 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6459 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6460 } 6461 6462 DO_LDFF1_LDNF1_1(bb, MO_8) 6463 DO_LDFF1_LDNF1_1(bhu, 
MO_16) 6464 DO_LDFF1_LDNF1_1(bhs, MO_16) 6465 DO_LDFF1_LDNF1_1(bsu, MO_32) 6466 DO_LDFF1_LDNF1_1(bss, MO_32) 6467 DO_LDFF1_LDNF1_1(bdu, MO_64) 6468 DO_LDFF1_LDNF1_1(bds, MO_64) 6469 6470 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) 6471 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) 6472 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) 6473 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) 6474 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) 6475 6476 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) 6477 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) 6478 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) 6479 6480 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) 6481 6482 #undef DO_LDFF1_LDNF1_1 6483 #undef DO_LDFF1_LDNF1_2 6484 6485 /* 6486 * Common helper for all contiguous 1,2,3,4-register predicated stores. 6487 */ 6488 6489 static inline QEMU_ALWAYS_INLINE 6490 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, 6491 uint32_t desc, const uintptr_t retaddr, 6492 const int esz, const int msz, const int N, uint32_t mtedesc, 6493 sve_ldst1_host_fn *host_fn, 6494 sve_ldst1_tlb_fn *tlb_fn) 6495 { 6496 const unsigned rd = simd_data(desc); 6497 const intptr_t reg_max = simd_oprsz(desc); 6498 intptr_t reg_off, reg_last, mem_off; 6499 SVEContLdSt info; 6500 void *host; 6501 int i, flags; 6502 6503 /* Find the active elements. */ 6504 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6505 /* The entire predicate was false; no store occurs. */ 6506 return; 6507 } 6508 6509 /* Probe the page(s). Exit with exception for any invalid page. */ 6510 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); 6511 6512 /* Handle watchpoints for all active elements. */ 6513 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6514 BP_MEM_WRITE, retaddr); 6515 6516 /* 6517 * Handle mte checks for all active elements. 6518 * Since TBI must be set for MTE, !mtedesc => !mte_active. 6519 */ 6520 if (mtedesc) { 6521 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6522 mtedesc, retaddr); 6523 } 6524 6525 flags = info.page[0].flags | info.page[1].flags; 6526 if (unlikely(flags != 0)) { 6527 /* 6528 * At least one page includes MMIO. 6529 * Any bus operation can fail with cpu_transaction_failed, 6530 * which for ARM will raise SyncExternal. We cannot avoid 6531 * this fault and will leave with the store incomplete. 
6532 */ 6533 mem_off = info.mem_off_first[0]; 6534 reg_off = info.reg_off_first[0]; 6535 reg_last = info.reg_off_last[1]; 6536 if (reg_last < 0) { 6537 reg_last = info.reg_off_split; 6538 if (reg_last < 0) { 6539 reg_last = info.reg_off_last[0]; 6540 } 6541 } 6542 6543 do { 6544 uint64_t pg = vg[reg_off >> 6]; 6545 do { 6546 if ((pg >> (reg_off & 63)) & 1) { 6547 for (i = 0; i < N; ++i) { 6548 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6549 addr + mem_off + (i << msz), retaddr); 6550 } 6551 } 6552 reg_off += 1 << esz; 6553 mem_off += N << msz; 6554 } while (reg_off & 63); 6555 } while (reg_off <= reg_last); 6556 return; 6557 } 6558 6559 mem_off = info.mem_off_first[0]; 6560 reg_off = info.reg_off_first[0]; 6561 reg_last = info.reg_off_last[0]; 6562 host = info.page[0].host; 6563 6564 set_helper_retaddr(retaddr); 6565 6566 while (reg_off <= reg_last) { 6567 uint64_t pg = vg[reg_off >> 6]; 6568 do { 6569 if ((pg >> (reg_off & 63)) & 1) { 6570 for (i = 0; i < N; ++i) { 6571 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6572 host + mem_off + (i << msz)); 6573 } 6574 } 6575 reg_off += 1 << esz; 6576 mem_off += N << msz; 6577 } while (reg_off <= reg_last && (reg_off & 63)); 6578 } 6579 6580 clear_helper_retaddr(); 6581 6582 /* 6583 * Use the slow path to manage the cross-page misalignment. 6584 * But we know this is RAM and cannot trap. 6585 */ 6586 mem_off = info.mem_off_split; 6587 if (unlikely(mem_off >= 0)) { 6588 reg_off = info.reg_off_split; 6589 for (i = 0; i < N; ++i) { 6590 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6591 addr + mem_off + (i << msz), retaddr); 6592 } 6593 } 6594 6595 mem_off = info.mem_off_first[1]; 6596 if (unlikely(mem_off >= 0)) { 6597 reg_off = info.reg_off_first[1]; 6598 reg_last = info.reg_off_last[1]; 6599 host = info.page[1].host; 6600 6601 set_helper_retaddr(retaddr); 6602 6603 do { 6604 uint64_t pg = vg[reg_off >> 6]; 6605 do { 6606 if ((pg >> (reg_off & 63)) & 1) { 6607 for (i = 0; i < N; ++i) { 6608 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6609 host + mem_off + (i << msz)); 6610 } 6611 } 6612 reg_off += 1 << esz; 6613 mem_off += N << msz; 6614 } while (reg_off & 63); 6615 } while (reg_off <= reg_last); 6616 6617 clear_helper_retaddr(); 6618 } 6619 } 6620 6621 static inline QEMU_ALWAYS_INLINE 6622 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6623 uint32_t desc, const uintptr_t ra, 6624 const int esz, const int msz, const int N, 6625 sve_ldst1_host_fn *host_fn, 6626 sve_ldst1_tlb_fn *tlb_fn) 6627 { 6628 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6629 int bit55 = extract64(addr, 55, 1); 6630 6631 /* Remove mtedesc from the normal sve descriptor. */ 6632 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6633 6634 /* Perform gross MTE suppression early. 
*/ 6635 if (!tbi_check(mtedesc, bit55) || 6636 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6637 mtedesc = 0; 6638 } 6639 6640 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6641 } 6642 6643 #define DO_STN_1(N, NAME, ESZ) \ 6644 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ 6645 target_ulong addr, uint32_t desc) \ 6646 { \ 6647 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ 6648 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6649 } \ 6650 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6651 target_ulong addr, uint32_t desc) \ 6652 { \ 6653 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ 6654 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6655 } 6656 6657 #define DO_STN_2(N, NAME, ESZ, MSZ) \ 6658 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ 6659 target_ulong addr, uint32_t desc) \ 6660 { \ 6661 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6662 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6663 } \ 6664 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ 6665 target_ulong addr, uint32_t desc) \ 6666 { \ 6667 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6668 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6669 } \ 6670 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6671 target_ulong addr, uint32_t desc) \ 6672 { \ 6673 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6674 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6675 } \ 6676 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6677 target_ulong addr, uint32_t desc) \ 6678 { \ 6679 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6680 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6681 } 6682 6683 DO_STN_1(1, bb, MO_8) 6684 DO_STN_1(1, bh, MO_16) 6685 DO_STN_1(1, bs, MO_32) 6686 DO_STN_1(1, bd, MO_64) 6687 DO_STN_1(2, bb, MO_8) 6688 DO_STN_1(3, bb, MO_8) 6689 DO_STN_1(4, bb, MO_8) 6690 6691 DO_STN_2(1, hh, MO_16, MO_16) 6692 DO_STN_2(1, hs, MO_32, MO_16) 6693 DO_STN_2(1, hd, MO_64, MO_16) 6694 DO_STN_2(2, hh, MO_16, MO_16) 6695 DO_STN_2(3, hh, MO_16, MO_16) 6696 DO_STN_2(4, hh, MO_16, MO_16) 6697 6698 DO_STN_2(1, ss, MO_32, MO_32) 6699 DO_STN_2(1, sd, MO_64, MO_32) 6700 DO_STN_2(2, ss, MO_32, MO_32) 6701 DO_STN_2(3, ss, MO_32, MO_32) 6702 DO_STN_2(4, ss, MO_32, MO_32) 6703 6704 DO_STN_2(1, dd, MO_64, MO_64) 6705 DO_STN_2(2, dd, MO_64, MO_64) 6706 DO_STN_2(3, dd, MO_64, MO_64) 6707 DO_STN_2(4, dd, MO_64, MO_64) 6708 6709 #undef DO_STN_1 6710 #undef DO_STN_2 6711 6712 /* 6713 * Loads with a vector index. 6714 */ 6715 6716 /* 6717 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. 
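 *
 * These feed the gather/scatter address computation
 *     addr = base + (off_fn(vm, reg_off) << scale);
 * so, for example, off_zss_d applied to a doubleword element holding
 * 0x00000000fffffffc produces (target_ulong)-4 (a negative offset),
 * whereas off_zsu_d produces +0xfffffffc.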
6718 */ 6719 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); 6720 6721 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) 6722 { 6723 return *(uint32_t *)(reg + H1_4(reg_ofs)); 6724 } 6725 6726 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) 6727 { 6728 return *(int32_t *)(reg + H1_4(reg_ofs)); 6729 } 6730 6731 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) 6732 { 6733 return (uint32_t)*(uint64_t *)(reg + reg_ofs); 6734 } 6735 6736 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) 6737 { 6738 return (int32_t)*(uint64_t *)(reg + reg_ofs); 6739 } 6740 6741 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) 6742 { 6743 return *(uint64_t *)(reg + reg_ofs); 6744 } 6745 6746 static inline QEMU_ALWAYS_INLINE 6747 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6748 target_ulong base, uint32_t desc, uintptr_t retaddr, 6749 uint32_t mtedesc, int esize, int msize, 6750 zreg_off_fn *off_fn, 6751 sve_ldst1_host_fn *host_fn, 6752 sve_ldst1_tlb_fn *tlb_fn) 6753 { 6754 const int mmu_idx = arm_env_mmu_index(env); 6755 const intptr_t reg_max = simd_oprsz(desc); 6756 const int scale = simd_data(desc); 6757 ARMVectorReg scratch; 6758 intptr_t reg_off; 6759 SVEHostPage info, info2; 6760 6761 memset(&scratch, 0, reg_max); 6762 reg_off = 0; 6763 do { 6764 uint64_t pg = vg[reg_off >> 6]; 6765 do { 6766 if (likely(pg & 1)) { 6767 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6768 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6769 6770 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, 6771 mmu_idx, retaddr); 6772 6773 if (likely(in_page >= msize)) { 6774 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6775 cpu_check_watchpoint(env_cpu(env), addr, msize, 6776 info.attrs, BP_MEM_READ, retaddr); 6777 } 6778 if (mtedesc && info.tagged) { 6779 mte_check(env, mtedesc, addr, retaddr); 6780 } 6781 if (unlikely(info.flags & TLB_MMIO)) { 6782 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6783 } else { 6784 set_helper_retaddr(retaddr); 6785 host_fn(&scratch, reg_off, info.host); 6786 clear_helper_retaddr(); 6787 } 6788 } else { 6789 /* Element crosses the page boundary. */ 6790 sve_probe_page(&info2, false, env, addr + in_page, 0, 6791 MMU_DATA_LOAD, mmu_idx, retaddr); 6792 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { 6793 cpu_check_watchpoint(env_cpu(env), addr, 6794 msize, info.attrs, 6795 BP_MEM_READ, retaddr); 6796 } 6797 if (mtedesc && info.tagged) { 6798 mte_check(env, mtedesc, addr, retaddr); 6799 } 6800 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6801 } 6802 } 6803 reg_off += esize; 6804 pg >>= esize; 6805 } while (reg_off & 63); 6806 } while (reg_off < reg_max); 6807 6808 /* Wait until all exceptions have been raised to write back. */ 6809 memcpy(vd, &scratch, reg_max); 6810 } 6811 6812 static inline QEMU_ALWAYS_INLINE 6813 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6814 target_ulong base, uint32_t desc, uintptr_t retaddr, 6815 int esize, int msize, zreg_off_fn *off_fn, 6816 sve_ldst1_host_fn *host_fn, 6817 sve_ldst1_tlb_fn *tlb_fn) 6818 { 6819 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6820 /* Remove mtedesc from the normal sve descriptor. */ 6821 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6822 6823 /* 6824 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6825 * offset base entirely over the address space hole to change the 6826 * pointer tag, or change the bit55 selector. 
So we could here 6827 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 6828 */ 6829 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6830 esize, msize, off_fn, host_fn, tlb_fn); 6831 } 6832 6833 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ 6834 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6835 void *vm, target_ulong base, uint32_t desc) \ 6836 { \ 6837 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 6838 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6839 } \ 6840 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6841 void *vm, target_ulong base, uint32_t desc) \ 6842 { \ 6843 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 6844 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6845 } 6846 6847 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ 6848 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6849 void *vm, target_ulong base, uint32_t desc) \ 6850 { \ 6851 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 6852 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6853 } \ 6854 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6855 void *vm, target_ulong base, uint32_t desc) \ 6856 { \ 6857 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 6858 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6859 } 6860 6861 DO_LD1_ZPZ_S(bsu, zsu, MO_8) 6862 DO_LD1_ZPZ_S(bsu, zss, MO_8) 6863 DO_LD1_ZPZ_D(bdu, zsu, MO_8) 6864 DO_LD1_ZPZ_D(bdu, zss, MO_8) 6865 DO_LD1_ZPZ_D(bdu, zd, MO_8) 6866 6867 DO_LD1_ZPZ_S(bss, zsu, MO_8) 6868 DO_LD1_ZPZ_S(bss, zss, MO_8) 6869 DO_LD1_ZPZ_D(bds, zsu, MO_8) 6870 DO_LD1_ZPZ_D(bds, zss, MO_8) 6871 DO_LD1_ZPZ_D(bds, zd, MO_8) 6872 6873 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) 6874 DO_LD1_ZPZ_S(hsu_le, zss, MO_16) 6875 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) 6876 DO_LD1_ZPZ_D(hdu_le, zss, MO_16) 6877 DO_LD1_ZPZ_D(hdu_le, zd, MO_16) 6878 6879 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) 6880 DO_LD1_ZPZ_S(hsu_be, zss, MO_16) 6881 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) 6882 DO_LD1_ZPZ_D(hdu_be, zss, MO_16) 6883 DO_LD1_ZPZ_D(hdu_be, zd, MO_16) 6884 6885 DO_LD1_ZPZ_S(hss_le, zsu, MO_16) 6886 DO_LD1_ZPZ_S(hss_le, zss, MO_16) 6887 DO_LD1_ZPZ_D(hds_le, zsu, MO_16) 6888 DO_LD1_ZPZ_D(hds_le, zss, MO_16) 6889 DO_LD1_ZPZ_D(hds_le, zd, MO_16) 6890 6891 DO_LD1_ZPZ_S(hss_be, zsu, MO_16) 6892 DO_LD1_ZPZ_S(hss_be, zss, MO_16) 6893 DO_LD1_ZPZ_D(hds_be, zsu, MO_16) 6894 DO_LD1_ZPZ_D(hds_be, zss, MO_16) 6895 DO_LD1_ZPZ_D(hds_be, zd, MO_16) 6896 6897 DO_LD1_ZPZ_S(ss_le, zsu, MO_32) 6898 DO_LD1_ZPZ_S(ss_le, zss, MO_32) 6899 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) 6900 DO_LD1_ZPZ_D(sdu_le, zss, MO_32) 6901 DO_LD1_ZPZ_D(sdu_le, zd, MO_32) 6902 6903 DO_LD1_ZPZ_S(ss_be, zsu, MO_32) 6904 DO_LD1_ZPZ_S(ss_be, zss, MO_32) 6905 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) 6906 DO_LD1_ZPZ_D(sdu_be, zss, MO_32) 6907 DO_LD1_ZPZ_D(sdu_be, zd, MO_32) 6908 6909 DO_LD1_ZPZ_D(sds_le, zsu, MO_32) 6910 DO_LD1_ZPZ_D(sds_le, zss, MO_32) 6911 DO_LD1_ZPZ_D(sds_le, zd, MO_32) 6912 6913 DO_LD1_ZPZ_D(sds_be, zsu, MO_32) 6914 DO_LD1_ZPZ_D(sds_be, zss, MO_32) 6915 DO_LD1_ZPZ_D(sds_be, zd, MO_32) 6916 6917 DO_LD1_ZPZ_D(dd_le, zsu, MO_64) 6918 DO_LD1_ZPZ_D(dd_le, zss, MO_64) 6919 DO_LD1_ZPZ_D(dd_le, zd, MO_64) 6920 6921 DO_LD1_ZPZ_D(dd_be, zsu, MO_64) 6922 DO_LD1_ZPZ_D(dd_be, zss, MO_64) 6923 DO_LD1_ZPZ_D(dd_be, zd, MO_64) 6924 6925 #undef DO_LD1_ZPZ_S 6926 #undef DO_LD1_ZPZ_D 6927 6928 /* First fault loads with a vector index. 
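 *
 * As with the contiguous first-fault forms, only the first active element
 * may take a real fault; a later element that cannot be loaded safely
 * (invalid page, MMIO, page-crossing, watchpoint hit, failed MTE probe)
 * merely truncates FFR via record_fault().  A sketch of a generated entry
 * point, e.g. from DO_LDFF1_ZPZ_D(dd_le, zd, MO_64):
 *
 *     void HELPER(sve_ldffdd_le_zd)(CPUARMState *env, void *vd, void *vg,
 *                                   void *vm, target_ulong base,
 *                                   uint32_t desc)
 *     {
 *         sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64,
 *                     MO_64, off_zd_d, sve_ld1dd_le_host,
 *                     sve_ld1dd_le_tlb);
 *     }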
*/ 6929 6930 /* 6931 * Common helpers for all gather first-faulting loads. 6932 */ 6933 6934 static inline QEMU_ALWAYS_INLINE 6935 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6936 target_ulong base, uint32_t desc, uintptr_t retaddr, 6937 uint32_t mtedesc, const int esz, const int msz, 6938 zreg_off_fn *off_fn, 6939 sve_ldst1_host_fn *host_fn, 6940 sve_ldst1_tlb_fn *tlb_fn) 6941 { 6942 const int mmu_idx = arm_env_mmu_index(env); 6943 const intptr_t reg_max = simd_oprsz(desc); 6944 const int scale = simd_data(desc); 6945 const int esize = 1 << esz; 6946 const int msize = 1 << msz; 6947 intptr_t reg_off; 6948 SVEHostPage info; 6949 target_ulong addr, in_page; 6950 ARMVectorReg scratch; 6951 6952 /* Skip to the first true predicate. */ 6953 reg_off = find_next_active(vg, 0, reg_max, esz); 6954 if (unlikely(reg_off >= reg_max)) { 6955 /* The entire predicate was false; no load occurs. */ 6956 memset(vd, 0, reg_max); 6957 return; 6958 } 6959 6960 /* Protect against overlap between vd and vm. */ 6961 if (unlikely(vd == vm)) { 6962 vm = memcpy(&scratch, vm, reg_max); 6963 } 6964 6965 /* 6966 * Probe the first element, allowing faults. 6967 */ 6968 addr = base + (off_fn(vm, reg_off) << scale); 6969 if (mtedesc) { 6970 mte_check(env, mtedesc, addr, retaddr); 6971 } 6972 tlb_fn(env, vd, reg_off, addr, retaddr); 6973 6974 /* After any fault, zero the other elements. */ 6975 swap_memzero(vd, reg_off); 6976 reg_off += esize; 6977 swap_memzero(vd + reg_off, reg_max - reg_off); 6978 6979 /* 6980 * Probe the remaining elements, not allowing faults. 6981 */ 6982 while (reg_off < reg_max) { 6983 uint64_t pg = vg[reg_off >> 6]; 6984 do { 6985 if (likely((pg >> (reg_off & 63)) & 1)) { 6986 addr = base + (off_fn(vm, reg_off) << scale); 6987 in_page = -(addr | TARGET_PAGE_MASK); 6988 6989 if (unlikely(in_page < msize)) { 6990 /* Stop if the element crosses a page boundary. */ 6991 goto fault; 6992 } 6993 6994 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, 6995 mmu_idx, retaddr); 6996 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { 6997 goto fault; 6998 } 6999 if (unlikely(info.flags & TLB_WATCHPOINT) && 7000 (cpu_watchpoint_address_matches 7001 (env_cpu(env), addr, msize) & BP_MEM_READ)) { 7002 goto fault; 7003 } 7004 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { 7005 goto fault; 7006 } 7007 7008 set_helper_retaddr(retaddr); 7009 host_fn(vd, reg_off, info.host); 7010 clear_helper_retaddr(); 7011 } 7012 reg_off += esize; 7013 } while (reg_off & 63); 7014 } 7015 return; 7016 7017 fault: 7018 record_fault(env, reg_off, reg_max); 7019 } 7020 7021 static inline QEMU_ALWAYS_INLINE 7022 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7023 target_ulong base, uint32_t desc, uintptr_t retaddr, 7024 const int esz, const int msz, 7025 zreg_off_fn *off_fn, 7026 sve_ldst1_host_fn *host_fn, 7027 sve_ldst1_tlb_fn *tlb_fn) 7028 { 7029 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7030 /* Remove mtedesc from the normal sve descriptor. */ 7031 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7032 7033 /* 7034 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7035 * offset base entirely over the address space hole to change the 7036 * pointer tag, or change the bit55 selector. So we could here 7037 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
7038 */ 7039 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7040 esz, msz, off_fn, host_fn, tlb_fn); 7041 } 7042 7043 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ 7044 void HELPER(sve_ldff##MEM##_##OFS) \ 7045 (CPUARMState *env, void *vd, void *vg, \ 7046 void *vm, target_ulong base, uint32_t desc) \ 7047 { \ 7048 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ 7049 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7050 } \ 7051 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 7052 (CPUARMState *env, void *vd, void *vg, \ 7053 void *vm, target_ulong base, uint32_t desc) \ 7054 { \ 7055 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ 7056 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7057 } 7058 7059 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ 7060 void HELPER(sve_ldff##MEM##_##OFS) \ 7061 (CPUARMState *env, void *vd, void *vg, \ 7062 void *vm, target_ulong base, uint32_t desc) \ 7063 { \ 7064 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ 7065 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7066 } \ 7067 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 7068 (CPUARMState *env, void *vd, void *vg, \ 7069 void *vm, target_ulong base, uint32_t desc) \ 7070 { \ 7071 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ 7072 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7073 } 7074 7075 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) 7076 DO_LDFF1_ZPZ_S(bsu, zss, MO_8) 7077 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) 7078 DO_LDFF1_ZPZ_D(bdu, zss, MO_8) 7079 DO_LDFF1_ZPZ_D(bdu, zd, MO_8) 7080 7081 DO_LDFF1_ZPZ_S(bss, zsu, MO_8) 7082 DO_LDFF1_ZPZ_S(bss, zss, MO_8) 7083 DO_LDFF1_ZPZ_D(bds, zsu, MO_8) 7084 DO_LDFF1_ZPZ_D(bds, zss, MO_8) 7085 DO_LDFF1_ZPZ_D(bds, zd, MO_8) 7086 7087 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) 7088 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) 7089 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) 7090 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) 7091 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) 7092 7093 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) 7094 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) 7095 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) 7096 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) 7097 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) 7098 7099 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16) 7100 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) 7101 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) 7102 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) 7103 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) 7104 7105 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) 7106 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) 7107 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) 7108 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) 7109 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) 7110 7111 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) 7112 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) 7113 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) 7114 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) 7115 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) 7116 7117 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) 7118 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) 7119 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) 7120 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) 7121 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) 7122 7123 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) 7124 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) 7125 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) 7126 7127 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) 7128 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) 7129 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) 7130 7131 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) 7132 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) 7133 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) 7134 7135 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) 7136 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) 7137 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) 7138 7139 /* Stores with a vector index. 
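 *
 * There is no first-fault form of the scatter store: every enabled
 * element is probed for faults and watchpoints before any data is
 * written, as described in sve_st1_z() below.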
*/ 7140 7141 static inline QEMU_ALWAYS_INLINE 7142 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7143 target_ulong base, uint32_t desc, uintptr_t retaddr, 7144 uint32_t mtedesc, int esize, int msize, 7145 zreg_off_fn *off_fn, 7146 sve_ldst1_host_fn *host_fn, 7147 sve_ldst1_tlb_fn *tlb_fn) 7148 { 7149 const int mmu_idx = arm_env_mmu_index(env); 7150 const intptr_t reg_max = simd_oprsz(desc); 7151 const int scale = simd_data(desc); 7152 void *host[ARM_MAX_VQ * 4]; 7153 intptr_t reg_off, i; 7154 SVEHostPage info, info2; 7155 7156 /* 7157 * Probe all of the elements for host addresses and flags. 7158 */ 7159 i = reg_off = 0; 7160 do { 7161 uint64_t pg = vg[reg_off >> 6]; 7162 do { 7163 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7164 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 7165 7166 host[i] = NULL; 7167 if (likely((pg >> (reg_off & 63)) & 1)) { 7168 if (likely(in_page >= msize)) { 7169 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE, 7170 mmu_idx, retaddr); 7171 if (!(info.flags & TLB_MMIO)) { 7172 host[i] = info.host; 7173 } 7174 } else { 7175 /* 7176 * Element crosses the page boundary. 7177 * Probe both pages, but do not record the host address, 7178 * so that we use the slow path. 7179 */ 7180 sve_probe_page(&info, false, env, addr, 0, 7181 MMU_DATA_STORE, mmu_idx, retaddr); 7182 sve_probe_page(&info2, false, env, addr + in_page, 0, 7183 MMU_DATA_STORE, mmu_idx, retaddr); 7184 info.flags |= info2.flags; 7185 } 7186 7187 if (unlikely(info.flags & TLB_WATCHPOINT)) { 7188 cpu_check_watchpoint(env_cpu(env), addr, msize, 7189 info.attrs, BP_MEM_WRITE, retaddr); 7190 } 7191 7192 if (mtedesc && info.tagged) { 7193 mte_check(env, mtedesc, addr, retaddr); 7194 } 7195 } 7196 i += 1; 7197 reg_off += esize; 7198 } while (reg_off & 63); 7199 } while (reg_off < reg_max); 7200 7201 /* 7202 * Now that we have recognized all exceptions except SyncExternal 7203 * (from TLB_MMIO), which we cannot avoid, perform all of the stores. 7204 * 7205 * Note for the common case of an element in RAM, not crossing a page 7206 * boundary, we have stored the host address in host[]. This doubles 7207 * as a first-level check against the predicate, since only enabled 7208 * elements have non-null host addresses. 7209 */ 7210 i = reg_off = 0; 7211 do { 7212 void *h = host[i]; 7213 if (likely(h != NULL)) { 7214 set_helper_retaddr(retaddr); 7215 host_fn(vd, reg_off, h); 7216 clear_helper_retaddr(); 7217 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) { 7218 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7219 tlb_fn(env, vd, reg_off, addr, retaddr); 7220 } 7221 i += 1; 7222 reg_off += esize; 7223 } while (reg_off < reg_max); 7224 } 7225 7226 static inline QEMU_ALWAYS_INLINE 7227 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7228 target_ulong base, uint32_t desc, uintptr_t retaddr, 7229 int esize, int msize, zreg_off_fn *off_fn, 7230 sve_ldst1_host_fn *host_fn, 7231 sve_ldst1_tlb_fn *tlb_fn) 7232 { 7233 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7234 /* Remove mtedesc from the normal sve descriptor. */ 7235 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7236 7237 /* 7238 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7239 * offset base entirely over the address space hole to change the 7240 * pointer tag, or change the bit55 selector. So we could here 7241 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
7242 */ 7243 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7244 esize, msize, off_fn, host_fn, tlb_fn); 7245 } 7246 7247 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ 7248 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7249 void *vm, target_ulong base, uint32_t desc) \ 7250 { \ 7251 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 7252 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7253 } \ 7254 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7255 void *vm, target_ulong base, uint32_t desc) \ 7256 { \ 7257 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 7258 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7259 } 7260 7261 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ 7262 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7263 void *vm, target_ulong base, uint32_t desc) \ 7264 { \ 7265 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 7266 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7267 } \ 7268 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7269 void *vm, target_ulong base, uint32_t desc) \ 7270 { \ 7271 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 7272 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7273 } 7274 7275 DO_ST1_ZPZ_S(bs, zsu, MO_8) 7276 DO_ST1_ZPZ_S(hs_le, zsu, MO_16) 7277 DO_ST1_ZPZ_S(hs_be, zsu, MO_16) 7278 DO_ST1_ZPZ_S(ss_le, zsu, MO_32) 7279 DO_ST1_ZPZ_S(ss_be, zsu, MO_32) 7280 7281 DO_ST1_ZPZ_S(bs, zss, MO_8) 7282 DO_ST1_ZPZ_S(hs_le, zss, MO_16) 7283 DO_ST1_ZPZ_S(hs_be, zss, MO_16) 7284 DO_ST1_ZPZ_S(ss_le, zss, MO_32) 7285 DO_ST1_ZPZ_S(ss_be, zss, MO_32) 7286 7287 DO_ST1_ZPZ_D(bd, zsu, MO_8) 7288 DO_ST1_ZPZ_D(hd_le, zsu, MO_16) 7289 DO_ST1_ZPZ_D(hd_be, zsu, MO_16) 7290 DO_ST1_ZPZ_D(sd_le, zsu, MO_32) 7291 DO_ST1_ZPZ_D(sd_be, zsu, MO_32) 7292 DO_ST1_ZPZ_D(dd_le, zsu, MO_64) 7293 DO_ST1_ZPZ_D(dd_be, zsu, MO_64) 7294 7295 DO_ST1_ZPZ_D(bd, zss, MO_8) 7296 DO_ST1_ZPZ_D(hd_le, zss, MO_16) 7297 DO_ST1_ZPZ_D(hd_be, zss, MO_16) 7298 DO_ST1_ZPZ_D(sd_le, zss, MO_32) 7299 DO_ST1_ZPZ_D(sd_be, zss, MO_32) 7300 DO_ST1_ZPZ_D(dd_le, zss, MO_64) 7301 DO_ST1_ZPZ_D(dd_be, zss, MO_64) 7302 7303 DO_ST1_ZPZ_D(bd, zd, MO_8) 7304 DO_ST1_ZPZ_D(hd_le, zd, MO_16) 7305 DO_ST1_ZPZ_D(hd_be, zd, MO_16) 7306 DO_ST1_ZPZ_D(sd_le, zd, MO_32) 7307 DO_ST1_ZPZ_D(sd_be, zd, MO_32) 7308 DO_ST1_ZPZ_D(dd_le, zd, MO_64) 7309 DO_ST1_ZPZ_D(dd_be, zd, MO_64) 7310 7311 #undef DO_ST1_ZPZ_S 7312 #undef DO_ST1_ZPZ_D 7313 7314 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7315 { 7316 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7317 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7318 7319 for (i = 0; i < opr_sz; ++i) { 7320 d[i] = n[i] ^ m[i] ^ k[i]; 7321 } 7322 } 7323 7324 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7325 { 7326 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7327 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7328 7329 for (i = 0; i < opr_sz; ++i) { 7330 d[i] = n[i] ^ (m[i] & ~k[i]); 7331 } 7332 } 7333 7334 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7335 { 7336 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7337 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7338 7339 for (i = 0; i < opr_sz; ++i) { 7340 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]); 7341 } 7342 } 7343 7344 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7345 { 7346 intptr_t i, opr_sz = simd_oprsz(desc) / 
8; 7347 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7348 7349 for (i = 0; i < opr_sz; ++i) { 7350 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]); 7351 } 7352 } 7353 7354 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7355 { 7356 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7357 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7358 7359 for (i = 0; i < opr_sz; ++i) { 7360 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i])); 7361 } 7362 } 7363 7364 /* 7365 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n. 7366 * See hasless(v,1) from 7367 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord 7368 */ 7369 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz) 7370 { 7371 int bits = 8 << esz; 7372 uint64_t ones = dup_const(esz, 1); 7373 uint64_t signs = ones << (bits - 1); 7374 uint64_t cmp0, cmp1; 7375 7376 cmp1 = dup_const(esz, n); 7377 cmp0 = cmp1 ^ m0; 7378 cmp1 = cmp1 ^ m1; 7379 cmp0 = (cmp0 - ones) & ~cmp0; 7380 cmp1 = (cmp1 - ones) & ~cmp1; 7381 return (cmp0 | cmp1) & signs; 7382 } 7383 7384 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg, 7385 uint32_t desc, int esz, bool nmatch) 7386 { 7387 uint16_t esz_mask = pred_esz_masks[esz]; 7388 intptr_t opr_sz = simd_oprsz(desc); 7389 uint32_t flags = PREDTEST_INIT; 7390 intptr_t i, j, k; 7391 7392 for (i = 0; i < opr_sz; i += 16) { 7393 uint64_t m0 = *(uint64_t *)(vm + i); 7394 uint64_t m1 = *(uint64_t *)(vm + i + 8); 7395 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask; 7396 uint16_t out = 0; 7397 7398 for (j = 0; j < 16; j += 8) { 7399 uint64_t n = *(uint64_t *)(vn + i + j); 7400 7401 for (k = 0; k < 8; k += 1 << esz) { 7402 if (pg & (1 << (j + k))) { 7403 bool o = do_match2(n >> (k * 8), m0, m1, esz); 7404 out |= (o ^ nmatch) << (j + k); 7405 } 7406 } 7407 } 7408 *(uint16_t *)(vd + H1_2(i >> 3)) = out; 7409 flags = iter_predtest_fwd(out, pg, flags); 7410 } 7411 return flags; 7412 } 7413 7414 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \ 7415 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 7416 { \ 7417 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \ 7418 } 7419 7420 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false) 7421 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false) 7422 7423 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true) 7424 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true) 7425 7426 #undef DO_PPZZ_MATCH 7427 7428 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg, 7429 uint32_t desc) 7430 { 7431 ARMVectorReg scratch; 7432 intptr_t i, j; 7433 intptr_t opr_sz = simd_oprsz(desc); 7434 uint32_t *d = vd, *n = vn, *m = vm; 7435 uint8_t *pg = vg; 7436 7437 if (d == n) { 7438 n = memcpy(&scratch, n, opr_sz); 7439 if (d == m) { 7440 m = n; 7441 } 7442 } else if (d == m) { 7443 m = memcpy(&scratch, m, opr_sz); 7444 } 7445 7446 for (i = 0; i < opr_sz; i += 4) { 7447 uint64_t count = 0; 7448 uint8_t pred; 7449 7450 pred = pg[H1(i >> 3)] >> (i & 7); 7451 if (pred & 1) { 7452 uint32_t nn = n[H4(i >> 2)]; 7453 7454 for (j = 0; j <= i; j += 4) { 7455 pred = pg[H1(j >> 3)] >> (j & 7); 7456 if ((pred & 1) && nn == m[H4(j >> 2)]) { 7457 ++count; 7458 } 7459 } 7460 } 7461 d[H4(i >> 2)] = count; 7462 } 7463 } 7464 7465 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg, 7466 uint32_t desc) 7467 { 7468 ARMVectorReg scratch; 7469 intptr_t i, j; 7470 intptr_t opr_sz = simd_oprsz(desc); 7471 uint64_t *d = vd, *n = vn, *m = vm; 7472 uint8_t *pg = vg; 7473 7474 if (d == n) { 7475 n = memcpy(&scratch, n, opr_sz); 7476 if (d 
== m) { 7477 m = n; 7478 } 7479 } else if (d == m) { 7480 m = memcpy(&scratch, m, opr_sz); 7481 } 7482 7483 for (i = 0; i < opr_sz / 8; ++i) { 7484 uint64_t count = 0; 7485 if (pg[H1(i)] & 1) { 7486 uint64_t nn = n[i]; 7487 for (j = 0; j <= i; ++j) { 7488 if ((pg[H1(j)] & 1) && nn == m[j]) { 7489 ++count; 7490 } 7491 } 7492 } 7493 d[i] = count; 7494 } 7495 } 7496 7497 /* 7498 * Returns the number of bytes in m0 and m1 that match n. 7499 * Unlike do_match2 we don't just need true/false, we need an exact count. 7500 * This requires two extra logical operations. 7501 */ 7502 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1) 7503 { 7504 const uint64_t mask = dup_const(MO_8, 0x7f); 7505 uint64_t cmp0, cmp1; 7506 7507 cmp1 = dup_const(MO_8, n); 7508 cmp0 = cmp1 ^ m0; 7509 cmp1 = cmp1 ^ m1; 7510 7511 /* 7512 * 1: clear msb of each byte to avoid carry to next byte (& mask) 7513 * 2: carry in to msb if byte != 0 (+ mask) 7514 * 3: set msb if cmp has msb set (| cmp) 7515 * 4: set ~msb to ignore them (| mask) 7516 * We now have 0xff for byte != 0 or 0x7f for byte == 0. 7517 * 5: invert, resulting in 0x80 if and only if byte == 0. 7518 */ 7519 cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask); 7520 cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask); 7521 7522 /* 7523 * Combine the two compares in a way that the bits do 7524 * not overlap, and so preserves the count of set bits. 7525 * If the host has an efficient instruction for ctpop, 7526 * then ctpop(x) + ctpop(y) has the same number of 7527 * operations as ctpop(x | (y >> 1)). If the host does 7528 * not have an efficient ctpop, then we only want to 7529 * use it once. 7530 */ 7531 return ctpop64(cmp0 | (cmp1 >> 1)); 7532 } 7533 7534 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc) 7535 { 7536 intptr_t i, j; 7537 intptr_t opr_sz = simd_oprsz(desc); 7538 7539 for (i = 0; i < opr_sz; i += 16) { 7540 uint64_t n0 = *(uint64_t *)(vn + i); 7541 uint64_t m0 = *(uint64_t *)(vm + i); 7542 uint64_t n1 = *(uint64_t *)(vn + i + 8); 7543 uint64_t m1 = *(uint64_t *)(vm + i + 8); 7544 uint64_t out0 = 0; 7545 uint64_t out1 = 0; 7546 7547 for (j = 0; j < 64; j += 8) { 7548 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1); 7549 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1); 7550 out0 |= cnt0 << j; 7551 out1 |= cnt1 << j; 7552 } 7553 7554 *(uint64_t *)(vd + i) = out0; 7555 *(uint64_t *)(vd + i + 8) = out1; 7556 } 7557 } 7558 7559 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc) 7560 { 7561 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7562 int shr = simd_data(desc); 7563 int shl = 8 - shr; 7564 uint64_t mask = dup_const(MO_8, 0xff >> shr); 7565 uint64_t *d = vd, *n = vn, *m = vm; 7566 7567 for (i = 0; i < opr_sz; ++i) { 7568 uint64_t t = n[i] ^ m[i]; 7569 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); 7570 } 7571 } 7572 7573 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc) 7574 { 7575 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7576 int shr = simd_data(desc); 7577 int shl = 16 - shr; 7578 uint64_t mask = dup_const(MO_16, 0xffff >> shr); 7579 uint64_t *d = vd, *n = vn, *m = vm; 7580 7581 for (i = 0; i < opr_sz; ++i) { 7582 uint64_t t = n[i] ^ m[i]; 7583 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); 7584 } 7585 } 7586 7587 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc) 7588 { 7589 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 7590 int shr = simd_data(desc); 7591 uint32_t *d = vd, *n = vn, *m = vm; 7592 7593 for (i = 0; i < opr_sz; ++i) { 7594 d[i] = ror32(n[i] ^ 
m[i], shr); 7595 } 7596 } 7597 7598 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va, 7599 float_status *status, uint32_t desc) 7600 { 7601 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4); 7602 7603 for (s = 0; s < opr_sz; ++s) { 7604 float32 *n = vn + s * sizeof(float32) * 4; 7605 float32 *m = vm + s * sizeof(float32) * 4; 7606 float32 *a = va + s * sizeof(float32) * 4; 7607 float32 *d = vd + s * sizeof(float32) * 4; 7608 float32 n00 = n[H4(0)], n01 = n[H4(1)]; 7609 float32 n10 = n[H4(2)], n11 = n[H4(3)]; 7610 float32 m00 = m[H4(0)], m01 = m[H4(1)]; 7611 float32 m10 = m[H4(2)], m11 = m[H4(3)]; 7612 float32 p0, p1; 7613 7614 /* i = 0, j = 0 */ 7615 p0 = float32_mul(n00, m00, status); 7616 p1 = float32_mul(n01, m01, status); 7617 d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status); 7618 7619 /* i = 0, j = 1 */ 7620 p0 = float32_mul(n00, m10, status); 7621 p1 = float32_mul(n01, m11, status); 7622 d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status); 7623 7624 /* i = 1, j = 0 */ 7625 p0 = float32_mul(n10, m00, status); 7626 p1 = float32_mul(n11, m01, status); 7627 d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status); 7628 7629 /* i = 1, j = 1 */ 7630 p0 = float32_mul(n10, m10, status); 7631 p1 = float32_mul(n11, m11, status); 7632 d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status); 7633 } 7634 } 7635 7636 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va, 7637 float_status *status, uint32_t desc) 7638 { 7639 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4); 7640 7641 for (s = 0; s < opr_sz; ++s) { 7642 float64 *n = vn + s * sizeof(float64) * 4; 7643 float64 *m = vm + s * sizeof(float64) * 4; 7644 float64 *a = va + s * sizeof(float64) * 4; 7645 float64 *d = vd + s * sizeof(float64) * 4; 7646 float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3]; 7647 float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3]; 7648 float64 p0, p1; 7649 7650 /* i = 0, j = 0 */ 7651 p0 = float64_mul(n00, m00, status); 7652 p1 = float64_mul(n01, m01, status); 7653 d[0] = float64_add(a[0], float64_add(p0, p1, status), status); 7654 7655 /* i = 0, j = 1 */ 7656 p0 = float64_mul(n00, m10, status); 7657 p1 = float64_mul(n01, m11, status); 7658 d[1] = float64_add(a[1], float64_add(p0, p1, status), status); 7659 7660 /* i = 1, j = 0 */ 7661 p0 = float64_mul(n10, m00, status); 7662 p1 = float64_mul(n11, m01, status); 7663 d[2] = float64_add(a[2], float64_add(p0, p1, status), status); 7664 7665 /* i = 1, j = 1 */ 7666 p0 = float64_mul(n10, m10, status); 7667 p1 = float64_mul(n11, m11, status); 7668 d[3] = float64_add(a[3], float64_add(p0, p1, status), status); 7669 } 7670 } 7671 7672 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 7673 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 7674 float_status *status, uint32_t desc) \ 7675 { \ 7676 intptr_t i = simd_oprsz(desc); \ 7677 uint64_t *g = vg; \ 7678 do { \ 7679 uint64_t pg = g[(i - 1) >> 6]; \ 7680 do { \ 7681 i -= sizeof(TYPEW); \ 7682 if (likely((pg >> (i & 63)) & 1)) { \ 7683 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 7684 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \ 7685 } \ 7686 } while (i & 63); \ 7687 } while (i != 0); \ 7688 } 7689 7690 DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16) 7691 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16) 7692 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32) 7693 7694 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 
7695 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 7696 float_status *status, uint32_t desc) \ 7697 { \ 7698 intptr_t i = simd_oprsz(desc); \ 7699 uint64_t *g = vg; \ 7700 do { \ 7701 uint64_t pg = g[(i - 1) >> 6]; \ 7702 do { \ 7703 i -= sizeof(TYPEW); \ 7704 if (likely((pg >> (i & 63)) & 1)) { \ 7705 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \ 7706 *(TYPEW *)(vd + HW(i)) = OP(nn, status); \ 7707 } \ 7708 } while (i & 63); \ 7709 } while (i != 0); \ 7710 } 7711 7712 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32) 7713 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64) 7714 7715 #undef DO_FCVTLT 7716 #undef DO_FCVTNT 7717
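
/*
 * Illustrative sketch only -- not referenced by the helpers above.  A
 * plain scalar reference for do_histseg_cnt(), counting how many of the
 * sixteen bytes in { m0, m1 } equal n.  Comparing its results with the
 * bit-trick version is a convenient way to check steps 1-5 of that
 * function's comment.  The name do_histseg_cnt_ref is invented for this
 * sketch and does not exist elsewhere in the code.
 */
static inline uint64_t do_histseg_cnt_ref(uint8_t n, uint64_t m0, uint64_t m1)
{
    uint64_t count = 0;
    int i;

    for (i = 0; i < 8; ++i) {
        /* Extract byte i of each operand and count exact matches. */
        count += (uint8_t)(m0 >> (i * 8)) == n;
        count += (uint8_t)(m1 >> (i * 8)) == n;
    }
    return count;
}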