/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "exec/exec-all.h"
#include "exec/page-protection.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "tcg/tcg.h"
#include "vec_internal.h"
#include "sve_ldst_internal.h"
#include "hw/core/tcg-cpu-ops.h"
#ifdef CONFIG_USER_ONLY
#include "user/page-protection.h"
#endif


/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1

/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}

/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e. last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e. first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}

/* The same for a single word predicate.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

/* The same for a multi-word predicate.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
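/*
 * As an informal worked example of the flag encoding above: for a single
 * predicate word with g = 0x11 and d = 0x01, the first active element
 * (bit 0) is set in d, so N is set; d & g is nonzero, so Z is clear; the
 * last active element (bit 4) is clear in d, so C is set.  The helper
 * therefore returns 0x80000000 | 2 | 1 = 0x80000003.
 */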
/* Similarly for single word elements.  */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    return word[byte & 0x11];
}

#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    uintptr_t opr_sz = simd_oprsz(desc); \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
    uintptr_t i; \
    for (i = 0; i < opr_sz / 8; ++i) { \
        d[i] = FUNC(n[i], m[i], g[i]); \
    } \
}

#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP

/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i], mm = m[i]; \
            d[i] = OP(nn, mm); \
        } \
    } \
}

#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)


/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1. Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
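/*
 * As an illustration, both division helpers yield 0 for a zero divisor,
 * and signed division of the minimum integer by -1 is handled by the -N
 * special case rather than by the C division operator; the result the
 * architecture requires for that case is INT32_MIN for the 32-bit form.
 */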
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types.  */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}

DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)

/* Note that all bits of the shift are significant
   and not modulo the element size.  */
#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)

DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)

DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)

static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    int8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    int16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    int32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)

static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    uint8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    uint16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    uint32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
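/*
 * A small worked example of the [SU]ADALP helpers above:
 * do_sadalp_h(0xff01, 10) splits the narrow input into the signed
 * bytes +1 and -1, so the accumulator value 10 is returned unchanged,
 * whereas do_uadalp_h(0xff01, 10) adds 1 + 255 and returns 266.
 */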
#define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)

#define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)

/*
 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
 * We pass in a pointer to a dummy saturation field to trigger
 * the saturating arithmetic but discard the information about
 * whether it has occurred.
 */
#define do_sqshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
#define do_sqshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
#define do_sqshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
#define do_sqshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)

#define do_uqshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_uqshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_uqshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
#define do_uqshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)

#define do_sqrshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
#define do_sqrshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
#define do_sqrshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
#define do_sqrshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)

#undef do_sqrshl_d
#define do_uqrshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
#define do_uqrshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
#define do_uqrshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
#define do_uqrshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)

#undef do_uqrshl_d

#define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))

DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)

#define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))

DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)

#define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))

DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)

static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    return val >= max ? max : val <= min ? min : val;
}

#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    int64_t r = n + m;
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}

DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)

#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)

static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t r = n + m;
    return r < n ? UINT64_MAX : r;
}
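/*
 * The two 64-bit saturating-add helpers above use the usual overflow
 * idioms: for do_sqadd_d, (r ^ n) & ~(n ^ m) is negative exactly when
 * the operands have the same sign but the result's sign differs
 * (e.g. INT64_MAX + 1 saturates to INT64_MAX); for do_uqadd_d,
 * unsigned wrap-around is detected by the result being smaller than
 * an operand.
 */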
DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = n - m;
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}

DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)

static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    return n > m ? n - m : 0;
}

DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow.  */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative.  */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive.  */
                return INT64_MAX;
            }
            /* Result is negative.  */
        }
    } else {
        /* Both inputs are positive: check for overflow.  */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}

DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)

static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        return n < -m ? 0 : r;
    }
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
#undef DO_ZPZZ
#undef DO_ZPZZ_D

/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements come from VN {I, I+1}.
 * If the slot I is odd, the elements come from VM {I-1, I}.
 * Load all of the input elements in each pair before overwriting output.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            TYPE n0 = *(TYPE *)(vn + H(i)); \
            TYPE m0 = *(TYPE *)(vm + H(i)); \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(n0, n1); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(m0, m1); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 2) { \
        TYPE n0 = n[i], n1 = n[i + 1]; \
        TYPE m0 = m[i], m1 = m[i + 1]; \
        if (pg[H1(i)] & 1) { \
            d[i] = OP(n0, n1); \
        } \
        if (pg[H1(i + 1)] & 1) { \
            d[i + 1] = OP(m0, m1); \
        } \
    } \
}

DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D

#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
                  float_status *status, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            TYPE n0 = *(TYPE *)(vn + H(i)); \
            TYPE m0 = *(TYPE *)(vm + H(i)); \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
            if (pg & 1) { \
                *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
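/*
 * To illustrate the pairwise layout shared by the integer and FP
 * expanders above: for ADDP on .s elements with Zn = {1, 2, 3, 4} and
 * Zm = {10, 20, 30, 40}, the active results are
 * Zd = {1 + 2, 10 + 20, 3 + 4, 30 + 40}, i.e. even destination slots
 * take pairs from Zn and odd slots take pairs from Zm.
 */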
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)

DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)

DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)

DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)

#undef DO_ZPZZ_PAIR_FP

/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 7); \
    } \
}

DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW

/* Fully general two-operand expander, controlled by a predicate.
 */
#define DO_ZPZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}
/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn); \
        } \
    } \
}

#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_AH_FABS_H(N)   (float16_is_any_nan(N) ? (N) : DO_FABS(N))
#define DO_AH_FABS_S(N)   (float32_is_any_nan(N) ? (N) : DO_FABS(N))
#define DO_AH_FABS_D(N)   (float64_is_any_nan(N) ? (N) : DO_FABS(N))

DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H)
DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S)
DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D)

#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_AH_FNEG_H(N)   (float16_is_any_nan(N) ? (N) : DO_FNEG(N))
#define DO_AH_FNEG_S(N)   (float32_is_any_nan(N) ? (N) : DO_FNEG(N))
#define DO_AH_FNEG_D(N)   (float64_is_any_nan(N) ? (N) : DO_FNEG(N))

DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H)
DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S)
DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D)

#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 2) {
        if (pg[H1(i)] & 1) {
            uint64_t n0 = n[i + 0];
            uint64_t n1 = n[i + 1];
            d[i + 0] = n1;
            d[i + 1] = n0;
        }
    }
}

DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)

#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)

#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)

DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)

/* Three-operand expander, unpredicated, in which the third operand is "wide".
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            i += sizeof(TYPE); \
        } while (i & 7); \
    } \
}

DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D
/*
 * Three-operand expander, unpredicated, in which the two inputs are
 * selected from the top or bottom half of the wide column.
 */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
    } \
}

DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
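/*
 * In the expander above, sel1/sel2 become a byte offset of either 0 or
 * sizeof(TYPEN), so each wide result element reads its narrow inputs
 * from the bottom (even) or top (odd) half of the same wide column.
 * E.g. SADDLB has both selectors 0 and adds the even narrow elements,
 * while SADDLT has both selectors 1 and adds the odd narrow elements.
 */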
/* Note that the multiply cannot overflow, but the doubling can.  */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t val = n * m;
    return DO_SQADD_H(val, val);
}

static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t val = n * m;
    return DO_SQADD_S(val, val);
}

static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    int64_t val = n * m;
    return do_sqadd_d(val, val);
}

DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

#undef DO_ZZZ_TB

#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEW *)(vn + HW(i)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
    } \
}

DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

#undef DO_ZZZ_WTB

#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
    intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
    for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
        TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
        *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
    } \
}

DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)

#undef DO_ZZZ_NTB

#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
        TYPEW aa = *(TYPEW *)(va + HW(i)); \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
    } \
}

DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#define DO_NMUL(N, M)  -(N * M)

DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)

DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)

#undef DO_ZZZW_ACC

#define DO_XTNB(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + i); \
        nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
        *(TYPE *)(vd + i) = nn; \
    } \
}

#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + i); \
        *(TYPEN *)(vd + i + odd) = OP(nn); \
    } \
}

#define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
#define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
#define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)

DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)

DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)

#define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
#define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
#define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)

DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)

DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)

DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)

DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)

#undef DO_XTNB
#undef DO_XTNT
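/*
 * Note the difference between the two narrowing expanders above: the
 * "B" (bottom) form writes the saturated narrow value into the low
 * half of each destination element and zeroes the high half, while the
 * "T" (top) form stores only the narrow high half and leaves the low
 * half of the destination unchanged.
 */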
void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
    uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t *a = va, *n = vn;
    uint64_t *d = vd, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint32_t e1 = a[2 * i + H4(0)];
        uint32_t e2 = n[2 * i + sel] ^ inv;
        uint64_t c = extract64(m[i], 32, 1);
        /* Compute and store the entire 33-bit result at once.  */
        d[i] = c + e1 + e2;
    }
}

void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t *d = vd, *a = va, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        Int128 e1 = int128_make64(a[i]);
        Int128 e2 = int128_make64(n[i + sel] ^ inv);
        Int128 c = int128_make64(m[i + 1] & 1);
        Int128 r = int128_add(int128_add(e1, e2), c);
        d[i + 0] = int128_getlo(r);
        d[i + 1] = int128_gethi(r);
    }
}
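/*
 * Roughly speaking, both add-with-carry helpers above form a 33-bit
 * (resp. 65-bit) sum: the carry-in is the low bit of the odd half of
 * each element pair of vm, the second operand is optionally inverted
 * for the subtract-with-carry forms, and the full widened result is
 * stored so that the carry-out lands in the odd half of the
 * destination pair.
 */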
#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
        TYPEW aa = *(TYPEW *)(va + HW(i)); \
        *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
    } \
}

DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQADD_H)
DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQADD_S)
DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqadd_d)

DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQSUB_H)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQSUB_S)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqsub_d)

#undef DO_SQDMLAL

#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
    int rot = simd_data(desc); \
    int sel_a = rot & 1, sel_b = sel_a ^ 1; \
    bool sub_r = rot == 1 || rot == 2; \
    bool sub_i = rot >= 2; \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    for (i = 0; i < opr_sz; i += 2) { \
        TYPE elt1_a = n[H(i + sel_a)]; \
        TYPE elt2_a = m[H(i + sel_a)]; \
        TYPE elt2_b = m[H(i + sel_b)]; \
        d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
        d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
    } \
}

#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))

DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)

#define DO_SQRDMLAH_B(N, M, A, S) \
    do_sqrdmlah_b(N, M, A, S, true)
#define DO_SQRDMLAH_H(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A, S) \
    do_sqrdmlah_d(N, M, A, S, true)

DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)

#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
    int sel_a = rot & 1, sel_b = sel_a ^ 1; \
    bool sub_r = rot == 1 || rot == 2; \
    bool sub_i = rot >= 2; \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
        TYPE elt2_a = m[H(i + idx + sel_a)]; \
        TYPE elt2_b = m[H(i + idx + sel_b)]; \
        for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
            TYPE elt1_a = n[H(i + j + sel_a)]; \
            d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
            d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
        } \
    } \
}

DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)

DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)

#undef DO_CMLA
#undef DO_CMLA_FUNC
#undef DO_CMLA_IDX_FUNC
#undef DO_SQRDMLAH_B
#undef DO_SQRDMLAH_H
#undef DO_SQRDMLAH_S
#undef DO_SQRDMLAH_D

/* Note N and M are 4 elements bundled into one unit.  */
static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
                         int sel_a, int sel_b, int sub_i)
{
    for (int i = 0; i <= 1; i++) {
        int32_t elt1_r = (int8_t)(n >> (16 * i));
        int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
        int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
        int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));

        a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
    }
    return a;
}

static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
                         int sel_a, int sel_b, int sub_i)
{
    for (int i = 0; i <= 1; i++) {
        int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
        int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
        int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
        int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));

        a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
    }
    return a;
}
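/*
 * Each (r, i) input pair above contributes r * elt2_a + i * elt2_b * sub_i
 * to the accumulator.  For rotation 0, sel_a selects the real part of m,
 * sel_b the imaginary part and sub_i is -1, so each pair contributes
 * Re(n)*Re(m) - Im(n)*Im(m), the real part of the complex product; the
 * other rotations permute and negate the operands to produce the
 * remaining CDOT variants.
 */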
void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
                              void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = simd_data(desc);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint32_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int e = 0; e < opr_sz / 4; e++) {
        d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
    }
}

void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
                              void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = simd_data(desc);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int e = 0; e < opr_sz / 8; e++) {
        d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
    }
}

void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    int opr_sz = simd_oprsz(desc);
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
    int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint32_t *d = vd, *n = vn, *m = vm, *a = va;

    for (int seg = 0; seg < opr_sz / 4; seg += 4) {
        uint32_t seg_m = m[seg + idx];
        for (int e = 0; e < 4; e++) {
            d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
                                   sel_a, sel_b, sub_i);
        }
    }
}

void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    int seg, opr_sz = simd_oprsz(desc);
    int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
    int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    int sel_a = rot & 1;
    int sel_b = sel_a ^ 1;
    int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
    uint64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (seg = 0; seg < opr_sz / 8; seg += 2) {
        uint64_t seg_m = m[seg + idx];
        for (int e = 0; e < 2; e++) {
            d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
                                   sel_a, sel_b, sub_i);
        }
    }
}

#define DO_ZZXZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
    intptr_t i, j, idx = simd_data(desc); \
    TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[i]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = OP(n[i + j], mm, a[i + j]); \
        } \
    } \
}

#define DO_SQRDMLAH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)

DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)

#define DO_SQRDMLSH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)

DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)

#undef DO_ZZXZ
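/*
 * In the indexed expanders above, the vector is processed in 128-bit
 * segments: the single element selected by "idx" is read once per
 * segment of vm and then combined with every element of vn in that
 * segment, which is how the indexed forms broadcast one lane within
 * each quadword.
 */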
#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
    for (i = 0; i < oprsz; i += 16) { \
        TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
        for (j = 0; j < 16; j += sizeof(TYPEW)) { \
            TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
            TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
            *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
        } \
    } \
}

#define DO_MLA(N, M, A)  (A + N * M)

DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)

#define DO_MLS(N, M, A)  (A - N * M)

DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)

#define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)

#define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
#define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))

DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZZXW

#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
    for (i = 0; i < oprsz; i += 16) { \
        TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
        for (j = 0; j < 16; j += sizeof(TYPEW)) { \
            TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
            *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
        } \
    } \
}

DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#undef DO_ZZX

#define DO_BITPERM(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + i); \
        TYPE mm = *(TYPE *)(vm + i); \
        *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
    } \
}
<< rb; 1624 ++rb; 1625 } 1626 } 1627 return res; 1628 } 1629 1630 DO_BITPERM(sve2_bext_b, uint8_t, bitextract) 1631 DO_BITPERM(sve2_bext_h, uint16_t, bitextract) 1632 DO_BITPERM(sve2_bext_s, uint32_t, bitextract) 1633 DO_BITPERM(sve2_bext_d, uint64_t, bitextract) 1634 1635 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n) 1636 { 1637 uint64_t res = 0; 1638 int rb, db = 0; 1639 1640 for (rb = 0; rb < n; ++rb) { 1641 if ((mask >> rb) & 1) { 1642 res |= ((data >> db) & 1) << rb; 1643 ++db; 1644 } 1645 } 1646 return res; 1647 } 1648 1649 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit) 1650 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit) 1651 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit) 1652 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit) 1653 1654 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n) 1655 { 1656 uint64_t resm = 0, resu = 0; 1657 int db, rbm = 0, rbu = 0; 1658 1659 for (db = 0; db < n; ++db) { 1660 uint64_t val = (data >> db) & 1; 1661 if ((mask >> db) & 1) { 1662 resm |= val << rbm++; 1663 } else { 1664 resu |= val << rbu++; 1665 } 1666 } 1667 1668 return resm | (resu << rbm); 1669 } 1670 1671 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup) 1672 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup) 1673 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup) 1674 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup) 1675 1676 #undef DO_BITPERM 1677 1678 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \ 1679 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1680 { \ 1681 intptr_t i, opr_sz = simd_oprsz(desc); \ 1682 int sub_r = simd_data(desc); \ 1683 if (sub_r) { \ 1684 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1685 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1686 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1687 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1688 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1689 acc_r = ADD_OP(acc_r, el2_i); \ 1690 acc_i = SUB_OP(acc_i, el2_r); \ 1691 *(TYPE *)(vd + H(i)) = acc_r; \ 1692 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1693 } \ 1694 } else { \ 1695 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \ 1696 TYPE acc_r = *(TYPE *)(vn + H(i)); \ 1697 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \ 1698 TYPE el2_r = *(TYPE *)(vm + H(i)); \ 1699 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \ 1700 acc_r = SUB_OP(acc_r, el2_i); \ 1701 acc_i = ADD_OP(acc_i, el2_r); \ 1702 *(TYPE *)(vd + H(i)) = acc_r; \ 1703 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \ 1704 } \ 1705 } \ 1706 } 1707 1708 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB) 1709 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB) 1710 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB) 1711 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB) 1712 1713 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B) 1714 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H) 1715 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S) 1716 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d) 1717 1718 #undef DO_CADD 1719 1720 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \ 1721 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1722 { \ 1723 intptr_t i, opr_sz = simd_oprsz(desc); \ 1724 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \ 1725 int shift = simd_data(desc) >> 1; \ 1726 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 1727 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \ 1728 *(TYPEW *)(vd + HW(i)) = nn << shift; \ 1729 } \ 1730 } 1731 1732 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1) 1733 DO_ZZI_SHLL(sve2_sshll_s, int32_t, 
int16_t, H1_4, H1_2) 1734 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4) 1735 1736 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1) 1737 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2) 1738 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4) 1739 1740 #undef DO_ZZI_SHLL 1741 1742 /* Two-operand reduction expander, controlled by a predicate. 1743 * The difference between TYPERED and TYPERET has to do with 1744 * sign-extension. E.g. for SMAX, TYPERED must be signed, 1745 * but TYPERET must be unsigned so that e.g. a 32-bit value 1746 * is not sign-extended to the ABI uint64_t return type. 1747 */ 1748 /* ??? If we were to vectorize this by hand the reduction ordering 1749 * would change. For integer operands, this is perfectly fine. 1750 */ 1751 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \ 1752 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1753 { \ 1754 intptr_t i, opr_sz = simd_oprsz(desc); \ 1755 TYPERED ret = INIT; \ 1756 for (i = 0; i < opr_sz; ) { \ 1757 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 1758 do { \ 1759 if (pg & 1) { \ 1760 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \ 1761 ret = OP(ret, nn); \ 1762 } \ 1763 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \ 1764 } while (i & 15); \ 1765 } \ 1766 return (TYPERET)ret; \ 1767 } 1768 1769 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \ 1770 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \ 1771 { \ 1772 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 1773 TYPEE *n = vn; \ 1774 uint8_t *pg = vg; \ 1775 TYPER ret = INIT; \ 1776 for (i = 0; i < opr_sz; i += 1) { \ 1777 if (pg[H1(i)] & 1) { \ 1778 TYPEE nn = n[i]; \ 1779 ret = OP(ret, nn); \ 1780 } \ 1781 } \ 1782 return ret; \ 1783 } 1784 1785 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR) 1786 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR) 1787 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR) 1788 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR) 1789 1790 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR) 1791 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR) 1792 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR) 1793 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR) 1794 1795 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND) 1796 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND) 1797 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND) 1798 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND) 1799 1800 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1801 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1802 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1803 1804 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD) 1805 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD) 1806 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD) 1807 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD) 1808 1809 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX) 1810 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX) 1811 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX) 1812 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX) 1813 1814 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX) 1815 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX) 1816 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX) 
1817 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX) 1818 1819 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN) 1820 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN) 1821 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN) 1822 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN) 1823 1824 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN) 1825 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN) 1826 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN) 1827 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN) 1828 1829 #undef DO_VPZ 1830 #undef DO_VPZ_D 1831 1832 /* Two vector operand, one scalar operand, unpredicated. */ 1833 #define DO_ZZI(NAME, TYPE, OP) \ 1834 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \ 1835 { \ 1836 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \ 1837 TYPE s = s64, *d = vd, *n = vn; \ 1838 for (i = 0; i < opr_sz; ++i) { \ 1839 d[i] = OP(n[i], s); \ 1840 } \ 1841 } 1842 1843 #define DO_SUBR(X, Y) (Y - X) 1844 1845 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) 1846 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR) 1847 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR) 1848 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR) 1849 1850 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX) 1851 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX) 1852 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX) 1853 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX) 1854 1855 DO_ZZI(sve_smini_b, int8_t, DO_MIN) 1856 DO_ZZI(sve_smini_h, int16_t, DO_MIN) 1857 DO_ZZI(sve_smini_s, int32_t, DO_MIN) 1858 DO_ZZI(sve_smini_d, int64_t, DO_MIN) 1859 1860 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX) 1861 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX) 1862 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX) 1863 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX) 1864 1865 DO_ZZI(sve_umini_b, uint8_t, DO_MIN) 1866 DO_ZZI(sve_umini_h, uint16_t, DO_MIN) 1867 DO_ZZI(sve_umini_s, uint32_t, DO_MIN) 1868 DO_ZZI(sve_umini_d, uint64_t, DO_MIN) 1869 1870 #undef DO_ZZI 1871 1872 #undef DO_AND 1873 #undef DO_ORR 1874 #undef DO_EOR 1875 #undef DO_BIC 1876 #undef DO_ADD 1877 #undef DO_SUB 1878 #undef DO_MAX 1879 #undef DO_MIN 1880 #undef DO_ABD 1881 #undef DO_MUL 1882 #undef DO_DIV 1883 #undef DO_ASR 1884 #undef DO_LSR 1885 #undef DO_LSL 1886 #undef DO_SUBR 1887 1888 /* Similar to the ARM LastActiveElement pseudocode function, except the 1889 result is multiplied by the element size. This includes the not found 1890 indication; e.g. not found for esz=3 is -8. */ 1891 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) 1892 { 1893 uint64_t mask = pred_esz_masks[esz]; 1894 intptr_t i = words; 1895 1896 do { 1897 uint64_t this_g = g[--i] & mask; 1898 if (this_g) { 1899 return i * 64 + (63 - clz64(this_g)); 1900 } 1901 } while (i > 0); 1902 return (intptr_t)-1 << esz; 1903 } 1904 1905 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc) 1906 { 1907 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1908 uint32_t flags = PREDTEST_INIT; 1909 uint64_t *d = vd, *g = vg; 1910 intptr_t i = 0; 1911 1912 do { 1913 uint64_t this_d = d[i]; 1914 uint64_t this_g = g[i]; 1915 1916 if (this_g) { 1917 if (!(flags & 4)) { 1918 /* Set in D the first bit of G. 
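That is, make the first active element of D true, as in the PFIRST pseudocode; all other bits of D are left unchanged.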
*/ 1919 this_d |= this_g & -this_g; 1920 d[i] = this_d; 1921 } 1922 flags = iter_predtest_fwd(this_d, this_g, flags); 1923 } 1924 } while (++i < words); 1925 1926 return flags; 1927 } 1928 1929 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) 1930 { 1931 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 1932 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 1933 uint32_t flags = PREDTEST_INIT; 1934 uint64_t *d = vd, *g = vg, esz_mask; 1935 intptr_t i, next; 1936 1937 next = last_active_element(vd, words, esz) + (1 << esz); 1938 esz_mask = pred_esz_masks[esz]; 1939 1940 /* Similar to the pseudocode for pnext, but scaled by ESZ 1941 so that we find the correct bit. */ 1942 if (next < words * 64) { 1943 uint64_t mask = -1; 1944 1945 if (next & 63) { 1946 mask = ~((1ull << (next & 63)) - 1); 1947 next &= -64; 1948 } 1949 do { 1950 uint64_t this_g = g[next / 64] & esz_mask & mask; 1951 if (this_g != 0) { 1952 next = (next & -64) + ctz64(this_g); 1953 break; 1954 } 1955 next += 64; 1956 mask = -1; 1957 } while (next < words * 64); 1958 } 1959 1960 i = 0; 1961 do { 1962 uint64_t this_d = 0; 1963 if (i == next / 64) { 1964 this_d = 1ull << (next & 63); 1965 } 1966 d[i] = this_d; 1967 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); 1968 } while (++i < words); 1969 1970 return flags; 1971 } 1972 1973 /* 1974 * Copy Zn into Zd, and store zero into inactive elements. 1975 * If inv, store zeros into the active elements. 1976 */ 1977 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc) 1978 { 1979 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1980 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1981 uint64_t *d = vd, *n = vn; 1982 uint8_t *pg = vg; 1983 1984 for (i = 0; i < opr_sz; i += 1) { 1985 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv); 1986 } 1987 } 1988 1989 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc) 1990 { 1991 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 1992 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 1993 uint64_t *d = vd, *n = vn; 1994 uint8_t *pg = vg; 1995 1996 for (i = 0; i < opr_sz; i += 1) { 1997 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv); 1998 } 1999 } 2000 2001 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc) 2002 { 2003 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2004 uint64_t inv = -(uint64_t)(simd_data(desc) & 1); 2005 uint64_t *d = vd, *n = vn; 2006 uint8_t *pg = vg; 2007 2008 for (i = 0; i < opr_sz; i += 1) { 2009 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv); 2010 } 2011 } 2012 2013 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc) 2014 { 2015 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2016 uint64_t *d = vd, *n = vn; 2017 uint8_t *pg = vg; 2018 uint8_t inv = simd_data(desc); 2019 2020 for (i = 0; i < opr_sz; i += 1) { 2021 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1); 2022 } 2023 } 2024 2025 /* Three-operand expander, immediate operand, controlled by a predicate. 2026 */ 2027 #define DO_ZPZI(NAME, TYPE, H, OP) \ 2028 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2029 { \ 2030 intptr_t i, opr_sz = simd_oprsz(desc); \ 2031 TYPE imm = simd_data(desc); \ 2032 for (i = 0; i < opr_sz; ) { \ 2033 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2034 do { \ 2035 if (pg & 1) { \ 2036 TYPE nn = *(TYPE *)(vn + H(i)); \ 2037 *(TYPE *)(vd + H(i)) = OP(nn, imm); \ 2038 } \ 2039 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2040 } while (i & 15); \ 2041 } \ 2042 } 2043 2044 /* Similarly, specialized for 64-bit operands. 
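The governing predicate is read one byte per element, testing only bit 0.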
*/ 2045 #define DO_ZPZI_D(NAME, TYPE, OP) \ 2046 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 2047 { \ 2048 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2049 TYPE *d = vd, *n = vn; \ 2050 TYPE imm = simd_data(desc); \ 2051 uint8_t *pg = vg; \ 2052 for (i = 0; i < opr_sz; i += 1) { \ 2053 if (pg[H1(i)] & 1) { \ 2054 TYPE nn = n[i]; \ 2055 d[i] = OP(nn, imm); \ 2056 } \ 2057 } \ 2058 } 2059 2060 #define DO_SHR(N, M) (N >> M) 2061 #define DO_SHL(N, M) (N << M) 2062 2063 /* Arithmetic shift right for division. This rounds negative numbers 2064 toward zero as per signed division. Therefore before shifting, 2065 when N is negative, add 2**M-1. */ 2066 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M) 2067 2068 static inline uint64_t do_urshr(uint64_t x, unsigned sh) 2069 { 2070 if (likely(sh < 64)) { 2071 return (x >> sh) + ((x >> (sh - 1)) & 1); 2072 } else if (sh == 64) { 2073 return x >> 63; 2074 } else { 2075 return 0; 2076 } 2077 } 2078 2079 static inline int64_t do_srshr(int64_t x, unsigned sh) 2080 { 2081 if (likely(sh < 64)) { 2082 return (x >> sh) + ((x >> (sh - 1)) & 1); 2083 } else { 2084 /* Rounding the sign bit always produces 0. */ 2085 return 0; 2086 } 2087 } 2088 2089 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR) 2090 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR) 2091 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR) 2092 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR) 2093 2094 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR) 2095 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR) 2096 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR) 2097 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR) 2098 2099 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL) 2100 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL) 2101 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL) 2102 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL) 2103 2104 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD) 2105 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD) 2106 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD) 2107 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD) 2108 2109 /* SVE2 bitwise shift by immediate */ 2110 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b) 2111 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h) 2112 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s) 2113 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d) 2114 2115 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b) 2116 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h) 2117 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s) 2118 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d) 2119 2120 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr) 2121 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr) 2122 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr) 2123 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr) 2124 2125 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr) 2126 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr) 2127 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr) 2128 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr) 2129 2130 #define do_suqrshl_b(n, m) \ 2131 ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); }) 2132 #define do_suqrshl_h(n, m) \ 2133 ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); }) 2134 #define do_suqrshl_s(n, m) \ 2135 ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); }) 2136 #define do_suqrshl_d(n, m) \ 2137 ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); }) 2138 2139 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b) 2140 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h) 2141 
DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s) 2142 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d) 2143 2144 #undef DO_ASRD 2145 #undef DO_ZPZI 2146 #undef DO_ZPZI_D 2147 2148 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \ 2149 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2150 { \ 2151 intptr_t i, opr_sz = simd_oprsz(desc); \ 2152 int shift = simd_data(desc); \ 2153 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2154 TYPEW nn = *(TYPEW *)(vn + i); \ 2155 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \ 2156 } \ 2157 } 2158 2159 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 2160 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2161 { \ 2162 intptr_t i, opr_sz = simd_oprsz(desc); \ 2163 int shift = simd_data(desc); \ 2164 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2165 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2166 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \ 2167 } \ 2168 } 2169 2170 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR) 2171 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR) 2172 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR) 2173 2174 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR) 2175 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR) 2176 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR) 2177 2178 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr) 2179 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr) 2180 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr) 2181 2182 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr) 2183 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr) 2184 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr) 2185 2186 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX) 2187 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX) 2188 #define DO_SQSHRUN_D(x, sh) \ 2189 do_sat_bhs((int64_t)(x) >> (sh < 64 ? 
sh : 63), 0, UINT32_MAX) 2190 2191 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H) 2192 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S) 2193 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D) 2194 2195 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H) 2196 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S) 2197 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D) 2198 2199 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX) 2200 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX) 2201 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX) 2202 2203 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H) 2204 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S) 2205 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D) 2206 2207 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H) 2208 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S) 2209 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D) 2210 2211 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX) 2212 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX) 2213 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX) 2214 2215 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H) 2216 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S) 2217 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D) 2218 2219 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H) 2220 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S) 2221 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D) 2222 2223 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX) 2224 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX) 2225 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX) 2226 2227 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H) 2228 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S) 2229 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D) 2230 2231 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H) 2232 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S) 2233 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D) 2234 2235 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX) 2236 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX) 2237 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX) 2238 2239 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H) 2240 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S) 2241 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D) 2242 2243 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H) 2244 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S) 2245 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D) 2246 2247 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX) 2248 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX) 2249 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX) 2250 2251 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H) 2252 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S) 2253 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D) 2254 2255 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H) 2256 DO_SHRNT(sve2_uqrshrnt_s, 
uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S) 2257 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D) 2258 2259 #undef DO_SHRNB 2260 #undef DO_SHRNT 2261 2262 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \ 2263 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2264 { \ 2265 intptr_t i, opr_sz = simd_oprsz(desc); \ 2266 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2267 TYPEW nn = *(TYPEW *)(vn + i); \ 2268 TYPEW mm = *(TYPEW *)(vm + i); \ 2269 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \ 2270 } \ 2271 } 2272 2273 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \ 2274 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2275 { \ 2276 intptr_t i, opr_sz = simd_oprsz(desc); \ 2277 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \ 2278 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 2279 TYPEW mm = *(TYPEW *)(vm + HW(i)); \ 2280 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \ 2281 } \ 2282 } 2283 2284 #define DO_ADDHN(N, M, SH) ((N + M) >> SH) 2285 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH) 2286 #define DO_SUBHN(N, M, SH) ((N - M) >> SH) 2287 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH) 2288 2289 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN) 2290 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN) 2291 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN) 2292 2293 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN) 2294 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN) 2295 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN) 2296 2297 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN) 2298 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN) 2299 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN) 2300 2301 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN) 2302 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN) 2303 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN) 2304 2305 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN) 2306 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN) 2307 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN) 2308 2309 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN) 2310 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN) 2311 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN) 2312 2313 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN) 2314 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN) 2315 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN) 2316 2317 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN) 2318 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN) 2319 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN) 2320 2321 #undef DO_RSUBHN 2322 #undef DO_SUBHN 2323 #undef DO_RADDHN 2324 #undef DO_ADDHN 2325 2326 #undef DO_BINOPNB 2327 2328 /* Fully general four-operand expander, controlled by a predicate. 
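The operands are Zd, Za, Zn and Zm; only the active elements of Zd are written, e.g. Zd = Za + Zn * Zm for the MLA expansion below.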
2329 */ 2330 #define DO_ZPZZZ(NAME, TYPE, H, OP) \ 2331 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2332 void *vg, uint32_t desc) \ 2333 { \ 2334 intptr_t i, opr_sz = simd_oprsz(desc); \ 2335 for (i = 0; i < opr_sz; ) { \ 2336 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 2337 do { \ 2338 if (pg & 1) { \ 2339 TYPE nn = *(TYPE *)(vn + H(i)); \ 2340 TYPE mm = *(TYPE *)(vm + H(i)); \ 2341 TYPE aa = *(TYPE *)(va + H(i)); \ 2342 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \ 2343 } \ 2344 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 2345 } while (i & 15); \ 2346 } \ 2347 } 2348 2349 /* Similarly, specialized for 64-bit operands. */ 2350 #define DO_ZPZZZ_D(NAME, TYPE, OP) \ 2351 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \ 2352 void *vg, uint32_t desc) \ 2353 { \ 2354 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ 2355 TYPE *d = vd, *a = va, *n = vn, *m = vm; \ 2356 uint8_t *pg = vg; \ 2357 for (i = 0; i < opr_sz; i += 1) { \ 2358 if (pg[H1(i)] & 1) { \ 2359 TYPE aa = a[i], nn = n[i], mm = m[i]; \ 2360 d[i] = OP(aa, nn, mm); \ 2361 } \ 2362 } \ 2363 } 2364 2365 #define DO_MLA(A, N, M) (A + N * M) 2366 #define DO_MLS(A, N, M) (A - N * M) 2367 2368 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA) 2369 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS) 2370 2371 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA) 2372 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS) 2373 2374 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA) 2375 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS) 2376 2377 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA) 2378 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS) 2379 2380 #undef DO_MLA 2381 #undef DO_MLS 2382 #undef DO_ZPZZZ 2383 #undef DO_ZPZZZ_D 2384 2385 void HELPER(sve_index_b)(void *vd, uint32_t start, 2386 uint32_t incr, uint32_t desc) 2387 { 2388 intptr_t i, opr_sz = simd_oprsz(desc); 2389 uint8_t *d = vd; 2390 for (i = 0; i < opr_sz; i += 1) { 2391 d[H1(i)] = start + i * incr; 2392 } 2393 } 2394 2395 void HELPER(sve_index_h)(void *vd, uint32_t start, 2396 uint32_t incr, uint32_t desc) 2397 { 2398 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2399 uint16_t *d = vd; 2400 for (i = 0; i < opr_sz; i += 1) { 2401 d[H2(i)] = start + i * incr; 2402 } 2403 } 2404 2405 void HELPER(sve_index_s)(void *vd, uint32_t start, 2406 uint32_t incr, uint32_t desc) 2407 { 2408 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2409 uint32_t *d = vd; 2410 for (i = 0; i < opr_sz; i += 1) { 2411 d[H4(i)] = start + i * incr; 2412 } 2413 } 2414 2415 void HELPER(sve_index_d)(void *vd, uint64_t start, 2416 uint64_t incr, uint32_t desc) 2417 { 2418 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2419 uint64_t *d = vd; 2420 for (i = 0; i < opr_sz; i += 1) { 2421 d[i] = start + i * incr; 2422 } 2423 } 2424 2425 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc) 2426 { 2427 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2428 uint32_t sh = simd_data(desc); 2429 uint32_t *d = vd, *n = vn, *m = vm; 2430 for (i = 0; i < opr_sz; i += 1) { 2431 d[i] = n[i] + (m[i] << sh); 2432 } 2433 } 2434 2435 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc) 2436 { 2437 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2438 uint64_t sh = simd_data(desc); 2439 uint64_t *d = vd, *n = vn, *m = vm; 2440 for (i = 0; i < opr_sz; i += 1) { 2441 d[i] = n[i] + (m[i] << sh); 2442 } 2443 } 2444 2445 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc) 2446 { 2447 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2448 uint64_t sh = simd_data(desc); 2449 uint64_t *d = vd, *n = vn, *m = vm; 2450 for (i = 0; i < opr_sz; i += 1) { 2451 d[i] = 
n[i] + ((uint64_t)(int32_t)m[i] << sh); 2452 } 2453 } 2454 2455 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc) 2456 { 2457 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2458 uint64_t sh = simd_data(desc); 2459 uint64_t *d = vd, *n = vn, *m = vm; 2460 for (i = 0; i < opr_sz; i += 1) { 2461 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh); 2462 } 2463 } 2464 2465 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc) 2466 { 2467 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2468 static const uint16_t coeff[] = { 2469 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8, 2470 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189, 2471 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295, 2472 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4, 2473 }; 2474 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2475 uint16_t *d = vd, *n = vn; 2476 2477 for (i = 0; i < opr_sz; i++) { 2478 uint16_t nn = n[i]; 2479 intptr_t idx = extract32(nn, 0, 5); 2480 uint16_t exp = extract32(nn, 5, 5); 2481 d[i] = coeff[idx] | (exp << 10); 2482 } 2483 } 2484 2485 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc) 2486 { 2487 /* These constants are cut-and-paste directly from the ARM pseudocode. */ 2488 static const uint32_t coeff[] = { 2489 0x000000, 0x0164d2, 0x02cd87, 0x043a29, 2490 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5, 2491 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc, 2492 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d, 2493 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda, 2494 0x1ef532, 0x20b051, 0x227043, 0x243516, 2495 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a, 2496 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4, 2497 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b, 2498 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd, 2499 0x45672a, 0x478d75, 0x49b9be, 0x4bec15, 2500 0x4e248c, 0x506334, 0x52a81e, 0x54f35b, 2501 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5, 2502 0x60ccdf, 0x633f89, 0x65b907, 0x68396a, 2503 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177, 2504 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c, 2505 }; 2506 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2507 uint32_t *d = vd, *n = vn; 2508 2509 for (i = 0; i < opr_sz; i++) { 2510 uint32_t nn = n[i]; 2511 intptr_t idx = extract32(nn, 0, 6); 2512 uint32_t exp = extract32(nn, 6, 8); 2513 d[i] = coeff[idx] | (exp << 23); 2514 } 2515 } 2516 2517 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc) 2518 { 2519 /* These constants are cut-and-paste directly from the ARM pseudocode. 
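Each entry matches the 52 fraction bits of 2^(i/64) in IEEE double format, for i = 0..63.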
*/ 2520 static const uint64_t coeff[] = { 2521 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull, 2522 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull, 2523 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull, 2524 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull, 2525 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull, 2526 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull, 2527 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull, 2528 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull, 2529 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull, 2530 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull, 2531 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull, 2532 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull, 2533 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull, 2534 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull, 2535 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull, 2536 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull, 2537 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull, 2538 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull, 2539 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull, 2540 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull, 2541 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull, 2542 0xFA7C1819E90D8ull, 2543 }; 2544 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2545 uint64_t *d = vd, *n = vn; 2546 2547 for (i = 0; i < opr_sz; i++) { 2548 uint64_t nn = n[i]; 2549 intptr_t idx = extract32(nn, 0, 6); 2550 uint64_t exp = extract32(nn, 6, 11); 2551 d[i] = coeff[idx] | (exp << 52); 2552 } 2553 } 2554 2555 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc) 2556 { 2557 intptr_t i, opr_sz = simd_oprsz(desc) / 2; 2558 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2559 uint16_t *d = vd, *n = vn, *m = vm; 2560 for (i = 0; i < opr_sz; i += 1) { 2561 uint16_t nn = n[i]; 2562 uint16_t mm = m[i]; 2563 if (mm & 1) { 2564 nn = float16_one; 2565 } 2566 if (mm & 2) { 2567 nn = float16_maybe_ah_chs(nn, fpcr_ah); 2568 } 2569 d[i] = nn; 2570 } 2571 } 2572 2573 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc) 2574 { 2575 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 2576 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2577 uint32_t *d = vd, *n = vn, *m = vm; 2578 for (i = 0; i < opr_sz; i += 1) { 2579 uint32_t nn = n[i]; 2580 uint32_t mm = m[i]; 2581 if (mm & 1) { 2582 nn = float32_one; 2583 } 2584 if (mm & 2) { 2585 nn = float32_maybe_ah_chs(nn, fpcr_ah); 2586 } 2587 d[i] = nn; 2588 } 2589 } 2590 2591 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc) 2592 { 2593 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2594 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1); 2595 uint64_t *d = vd, *n = vn, *m = vm; 2596 for (i = 0; i < opr_sz; i += 1) { 2597 uint64_t nn = n[i]; 2598 uint64_t mm = m[i]; 2599 if (mm & 1) { 2600 nn = float64_one; 2601 } 2602 if (mm & 2) { 2603 nn = float64_maybe_ah_chs(nn, fpcr_ah); 2604 } 2605 d[i] = nn; 2606 } 2607 } 2608 2609 /* 2610 * Signed saturating addition with scalar operand. 
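* The addend B may be negative; each element result saturates to the signed range of its width, e.g. adding -1 to INT8_MIN leaves INT8_MIN in the byte form.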
2611 */ 2612 2613 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2614 { 2615 intptr_t i, oprsz = simd_oprsz(desc); 2616 2617 for (i = 0; i < oprsz; i += sizeof(int8_t)) { 2618 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i)); 2619 } 2620 } 2621 2622 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2623 { 2624 intptr_t i, oprsz = simd_oprsz(desc); 2625 2626 for (i = 0; i < oprsz; i += sizeof(int16_t)) { 2627 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i)); 2628 } 2629 } 2630 2631 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2632 { 2633 intptr_t i, oprsz = simd_oprsz(desc); 2634 2635 for (i = 0; i < oprsz; i += sizeof(int32_t)) { 2636 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i)); 2637 } 2638 } 2639 2640 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc) 2641 { 2642 intptr_t i, oprsz = simd_oprsz(desc); 2643 2644 for (i = 0; i < oprsz; i += sizeof(int64_t)) { 2645 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i)); 2646 } 2647 } 2648 2649 /* 2650 * Unsigned saturating addition with scalar operand. 2651 */ 2652 2653 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc) 2654 { 2655 intptr_t i, oprsz = simd_oprsz(desc); 2656 2657 for (i = 0; i < oprsz; i += sizeof(uint8_t)) { 2658 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i)); 2659 } 2660 } 2661 2662 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc) 2663 { 2664 intptr_t i, oprsz = simd_oprsz(desc); 2665 2666 for (i = 0; i < oprsz; i += sizeof(uint16_t)) { 2667 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i)); 2668 } 2669 } 2670 2671 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc) 2672 { 2673 intptr_t i, oprsz = simd_oprsz(desc); 2674 2675 for (i = 0; i < oprsz; i += sizeof(uint32_t)) { 2676 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i)); 2677 } 2678 } 2679 2680 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2681 { 2682 intptr_t i, oprsz = simd_oprsz(desc); 2683 2684 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2685 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i)); 2686 } 2687 } 2688 2689 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc) 2690 { 2691 intptr_t i, oprsz = simd_oprsz(desc); 2692 2693 for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 2694 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b); 2695 } 2696 } 2697 2698 /* Two operand predicated copy immediate with merge. All valid immediates 2699 * can fit within 17 signed bits in the simd_data field. 
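* For the byte, half and word forms the immediate is replicated with dup_const across each 64-bit unit processed per iteration.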
2700 */ 2701 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg, 2702 uint64_t mm, uint32_t desc) 2703 { 2704 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2705 uint64_t *d = vd, *n = vn; 2706 uint8_t *pg = vg; 2707 2708 mm = dup_const(MO_8, mm); 2709 for (i = 0; i < opr_sz; i += 1) { 2710 uint64_t nn = n[i]; 2711 uint64_t pp = expand_pred_b(pg[H1(i)]); 2712 d[i] = (mm & pp) | (nn & ~pp); 2713 } 2714 } 2715 2716 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg, 2717 uint64_t mm, uint32_t desc) 2718 { 2719 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2720 uint64_t *d = vd, *n = vn; 2721 uint8_t *pg = vg; 2722 2723 mm = dup_const(MO_16, mm); 2724 for (i = 0; i < opr_sz; i += 1) { 2725 uint64_t nn = n[i]; 2726 uint64_t pp = expand_pred_h(pg[H1(i)]); 2727 d[i] = (mm & pp) | (nn & ~pp); 2728 } 2729 } 2730 2731 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg, 2732 uint64_t mm, uint32_t desc) 2733 { 2734 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2735 uint64_t *d = vd, *n = vn; 2736 uint8_t *pg = vg; 2737 2738 mm = dup_const(MO_32, mm); 2739 for (i = 0; i < opr_sz; i += 1) { 2740 uint64_t nn = n[i]; 2741 uint64_t pp = expand_pred_s(pg[H1(i)]); 2742 d[i] = (mm & pp) | (nn & ~pp); 2743 } 2744 } 2745 2746 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg, 2747 uint64_t mm, uint32_t desc) 2748 { 2749 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2750 uint64_t *d = vd, *n = vn; 2751 uint8_t *pg = vg; 2752 2753 for (i = 0; i < opr_sz; i += 1) { 2754 uint64_t nn = n[i]; 2755 d[i] = (pg[H1(i)] & 1 ? mm : nn); 2756 } 2757 } 2758 2759 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc) 2760 { 2761 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2762 uint64_t *d = vd; 2763 uint8_t *pg = vg; 2764 2765 val = dup_const(MO_8, val); 2766 for (i = 0; i < opr_sz; i += 1) { 2767 d[i] = val & expand_pred_b(pg[H1(i)]); 2768 } 2769 } 2770 2771 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc) 2772 { 2773 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2774 uint64_t *d = vd; 2775 uint8_t *pg = vg; 2776 2777 val = dup_const(MO_16, val); 2778 for (i = 0; i < opr_sz; i += 1) { 2779 d[i] = val & expand_pred_h(pg[H1(i)]); 2780 } 2781 } 2782 2783 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc) 2784 { 2785 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2786 uint64_t *d = vd; 2787 uint8_t *pg = vg; 2788 2789 val = dup_const(MO_32, val); 2790 for (i = 0; i < opr_sz; i += 1) { 2791 d[i] = val & expand_pred_s(pg[H1(i)]); 2792 } 2793 } 2794 2795 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc) 2796 { 2797 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2798 uint64_t *d = vd; 2799 uint8_t *pg = vg; 2800 2801 for (i = 0; i < opr_sz; i += 1) { 2802 d[i] = (pg[H1(i)] & 1 ? val : 0); 2803 } 2804 } 2805 2806 /* Big-endian hosts need to frob the byte indices. If the copy 2807 * happens to be 8-byte aligned, then no frobbing necessary. 
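* The H-macros invert the low address bits on such hosts, so when source, destination and length are all multiples of 8 a plain memmove gives the same result.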
2808 */ 2809 static void swap_memmove(void *vd, void *vs, size_t n) 2810 { 2811 uintptr_t d = (uintptr_t)vd; 2812 uintptr_t s = (uintptr_t)vs; 2813 uintptr_t o = (d | s | n) & 7; 2814 size_t i; 2815 2816 #if !HOST_BIG_ENDIAN 2817 o = 0; 2818 #endif 2819 switch (o) { 2820 case 0: 2821 memmove(vd, vs, n); 2822 break; 2823 2824 case 4: 2825 if (d < s || d >= s + n) { 2826 for (i = 0; i < n; i += 4) { 2827 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2828 } 2829 } else { 2830 for (i = n; i > 0; ) { 2831 i -= 4; 2832 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i); 2833 } 2834 } 2835 break; 2836 2837 case 2: 2838 case 6: 2839 if (d < s || d >= s + n) { 2840 for (i = 0; i < n; i += 2) { 2841 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2842 } 2843 } else { 2844 for (i = n; i > 0; ) { 2845 i -= 2; 2846 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i); 2847 } 2848 } 2849 break; 2850 2851 default: 2852 if (d < s || d >= s + n) { 2853 for (i = 0; i < n; i++) { 2854 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2855 } 2856 } else { 2857 for (i = n; i > 0; ) { 2858 i -= 1; 2859 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i); 2860 } 2861 } 2862 break; 2863 } 2864 } 2865 2866 /* Similarly for memset of 0. */ 2867 static void swap_memzero(void *vd, size_t n) 2868 { 2869 uintptr_t d = (uintptr_t)vd; 2870 uintptr_t o = (d | n) & 7; 2871 size_t i; 2872 2873 /* Usually, the first bit of a predicate is set, so N is 0. */ 2874 if (likely(n == 0)) { 2875 return; 2876 } 2877 2878 #if !HOST_BIG_ENDIAN 2879 o = 0; 2880 #endif 2881 switch (o) { 2882 case 0: 2883 memset(vd, 0, n); 2884 break; 2885 2886 case 4: 2887 for (i = 0; i < n; i += 4) { 2888 *(uint32_t *)H1_4(d + i) = 0; 2889 } 2890 break; 2891 2892 case 2: 2893 case 6: 2894 for (i = 0; i < n; i += 2) { 2895 *(uint16_t *)H1_2(d + i) = 0; 2896 } 2897 break; 2898 2899 default: 2900 for (i = 0; i < n; i++) { 2901 *(uint8_t *)H1(d + i) = 0; 2902 } 2903 break; 2904 } 2905 } 2906 2907 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc) 2908 { 2909 intptr_t opr_sz = simd_oprsz(desc); 2910 size_t n_ofs = simd_data(desc); 2911 size_t n_siz = opr_sz - n_ofs; 2912 2913 if (vd != vm) { 2914 swap_memmove(vd, vn + n_ofs, n_siz); 2915 swap_memmove(vd + n_siz, vm, n_ofs); 2916 } else if (vd != vn) { 2917 swap_memmove(vd + n_siz, vd, n_ofs); 2918 swap_memmove(vd, vn + n_ofs, n_siz); 2919 } else { 2920 /* vd == vn == vm. Need temp space. 
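Save the low N_OFS bytes into TMP before they are overwritten, then append them after the shifted remainder.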
*/ 2921 ARMVectorReg tmp; 2922 swap_memmove(&tmp, vm, n_ofs); 2923 swap_memmove(vd, vd + n_ofs, n_siz); 2924 memcpy(vd + n_siz, &tmp, n_ofs); 2925 } 2926 } 2927 2928 #define DO_INSR(NAME, TYPE, H) \ 2929 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \ 2930 { \ 2931 intptr_t opr_sz = simd_oprsz(desc); \ 2932 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \ 2933 *(TYPE *)(vd + H(0)) = val; \ 2934 } 2935 2936 DO_INSR(sve_insr_b, uint8_t, H1) 2937 DO_INSR(sve_insr_h, uint16_t, H1_2) 2938 DO_INSR(sve_insr_s, uint32_t, H1_4) 2939 DO_INSR(sve_insr_d, uint64_t, H1_8) 2940 2941 #undef DO_INSR 2942 2943 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc) 2944 { 2945 intptr_t i, j, opr_sz = simd_oprsz(desc); 2946 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2947 uint64_t f = *(uint64_t *)(vn + i); 2948 uint64_t b = *(uint64_t *)(vn + j); 2949 *(uint64_t *)(vd + i) = bswap64(b); 2950 *(uint64_t *)(vd + j) = bswap64(f); 2951 } 2952 } 2953 2954 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc) 2955 { 2956 intptr_t i, j, opr_sz = simd_oprsz(desc); 2957 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2958 uint64_t f = *(uint64_t *)(vn + i); 2959 uint64_t b = *(uint64_t *)(vn + j); 2960 *(uint64_t *)(vd + i) = hswap64(b); 2961 *(uint64_t *)(vd + j) = hswap64(f); 2962 } 2963 } 2964 2965 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc) 2966 { 2967 intptr_t i, j, opr_sz = simd_oprsz(desc); 2968 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2969 uint64_t f = *(uint64_t *)(vn + i); 2970 uint64_t b = *(uint64_t *)(vn + j); 2971 *(uint64_t *)(vd + i) = rol64(b, 32); 2972 *(uint64_t *)(vd + j) = rol64(f, 32); 2973 } 2974 } 2975 2976 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc) 2977 { 2978 intptr_t i, j, opr_sz = simd_oprsz(desc); 2979 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) { 2980 uint64_t f = *(uint64_t *)(vn + i); 2981 uint64_t b = *(uint64_t *)(vn + j); 2982 *(uint64_t *)(vd + i) = b; 2983 *(uint64_t *)(vd + j) = f; 2984 } 2985 } 2986 2987 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool); 2988 2989 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc, 2990 bool is_tbx, tb_impl_fn *fn) 2991 { 2992 ARMVectorReg scratch; 2993 uintptr_t oprsz = simd_oprsz(desc); 2994 2995 if (unlikely(vd == vn)) { 2996 vn = memcpy(&scratch, vn, oprsz); 2997 } 2998 2999 fn(vd, vn, NULL, vm, oprsz, is_tbx); 3000 } 3001 3002 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm, 3003 uint32_t desc, bool is_tbx, tb_impl_fn *fn) 3004 { 3005 ARMVectorReg scratch; 3006 uintptr_t oprsz = simd_oprsz(desc); 3007 3008 if (unlikely(vd == vn0)) { 3009 vn0 = memcpy(&scratch, vn0, oprsz); 3010 if (vd == vn1) { 3011 vn1 = vn0; 3012 } 3013 } else if (unlikely(vd == vn1)) { 3014 vn1 = memcpy(&scratch, vn1, oprsz); 3015 } 3016 3017 fn(vd, vn0, vn1, vm, oprsz, is_tbx); 3018 } 3019 3020 #define DO_TB(SUFF, TYPE, H) \ 3021 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \ 3022 void *vm, uintptr_t oprsz, bool is_tbx) \ 3023 { \ 3024 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \ 3025 uintptr_t i, nelem = oprsz / sizeof(TYPE); \ 3026 for (i = 0; i < nelem; ++i) { \ 3027 TYPE index = indexes[H1(i)], val = 0; \ 3028 if (index < nelem) { \ 3029 val = tbl0[H(index)]; \ 3030 } else { \ 3031 index -= nelem; \ 3032 if (tbl1 && index < nelem) { \ 3033 val = tbl1[H(index)]; \ 3034 } else if (is_tbx) { \ 3035 continue; \ 3036 } \ 3037 } \ 3038 
d[H(i)] = val; \ 3039 } \ 3040 } \ 3041 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3042 { \ 3043 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \ 3044 } \ 3045 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \ 3046 void *vm, uint32_t desc) \ 3047 { \ 3048 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \ 3049 } \ 3050 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \ 3051 { \ 3052 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \ 3053 } 3054 3055 DO_TB(b, uint8_t, H1) 3056 DO_TB(h, uint16_t, H2) 3057 DO_TB(s, uint32_t, H4) 3058 DO_TB(d, uint64_t, H8) 3059 3060 #undef DO_TB 3061 3062 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \ 3063 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 3064 { \ 3065 intptr_t i, opr_sz = simd_oprsz(desc); \ 3066 TYPED *d = vd; \ 3067 TYPES *n = vn; \ 3068 ARMVectorReg tmp; \ 3069 if (unlikely(vn - vd < opr_sz)) { \ 3070 n = memcpy(&tmp, n, opr_sz / 2); \ 3071 } \ 3072 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \ 3073 d[HD(i)] = n[HS(i)]; \ 3074 } \ 3075 } 3076 3077 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1) 3078 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2) 3079 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4) 3080 3081 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1) 3082 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2) 3083 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4) 3084 3085 #undef DO_UNPK 3086 3087 /* Mask of bits included in the even numbered predicates of width esz. 3088 * We also use this for expand_bits/compress_bits, and so extend the 3089 * same pattern out to 16-bit units. 3090 */ 3091 static const uint64_t even_bit_esz_masks[5] = { 3092 0x5555555555555555ull, 3093 0x3333333333333333ull, 3094 0x0f0f0f0f0f0f0f0full, 3095 0x00ff00ff00ff00ffull, 3096 0x0000ffff0000ffffull, 3097 }; 3098 3099 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits. 3100 * For N==0, this corresponds to the operation that in qemu/bitops.h 3101 * we call half_shuffle64; this algorithm is from Hacker's Delight, 3102 * section 7-2 Shuffling Bits. 3103 */ 3104 static uint64_t expand_bits(uint64_t x, int n) 3105 { 3106 int i; 3107 3108 x &= 0xffffffffu; 3109 for (i = 4; i >= n; i--) { 3110 int sh = 1 << i; 3111 x = ((x << sh) | x) & even_bit_esz_masks[i]; 3112 } 3113 return x; 3114 } 3115 3116 /* Compress units of 2**(N+1) bits to units of 2**N bits. 3117 * For N==0, this corresponds to the operation that in qemu/bitops.h 3118 * we call half_unshuffle64; this algorithm is from Hacker's Delight, 3119 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle. 
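* For N==0 this gathers the even-numbered bits of X into the low 32 bits of the result, e.g. compress_bits(0x5, 0) == 0x3.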
3120 */ 3121 static uint64_t compress_bits(uint64_t x, int n) 3122 { 3123 int i; 3124 3125 for (i = n; i <= 4; i++) { 3126 int sh = 1 << i; 3127 x &= even_bit_esz_masks[i]; 3128 x = (x >> sh) | x; 3129 } 3130 return x & 0xffffffffu; 3131 } 3132 3133 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3134 { 3135 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3136 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3137 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3138 int esize = 1 << esz; 3139 uint64_t *d = vd; 3140 intptr_t i; 3141 3142 if (oprsz <= 8) { 3143 uint64_t nn = *(uint64_t *)vn; 3144 uint64_t mm = *(uint64_t *)vm; 3145 int half = 4 * oprsz; 3146 3147 nn = extract64(nn, high * half, half); 3148 mm = extract64(mm, high * half, half); 3149 nn = expand_bits(nn, esz); 3150 mm = expand_bits(mm, esz); 3151 d[0] = nn | (mm << esize); 3152 } else { 3153 ARMPredicateReg tmp; 3154 3155 /* We produce output faster than we consume input. 3156 Therefore we must be mindful of possible overlap. */ 3157 if (vd == vn) { 3158 vn = memcpy(&tmp, vn, oprsz); 3159 if (vd == vm) { 3160 vm = vn; 3161 } 3162 } else if (vd == vm) { 3163 vm = memcpy(&tmp, vm, oprsz); 3164 } 3165 if (high) { 3166 high = oprsz >> 1; 3167 } 3168 3169 if ((oprsz & 7) == 0) { 3170 uint32_t *n = vn, *m = vm; 3171 high >>= 2; 3172 3173 for (i = 0; i < oprsz / 8; i++) { 3174 uint64_t nn = n[H4(high + i)]; 3175 uint64_t mm = m[H4(high + i)]; 3176 3177 nn = expand_bits(nn, esz); 3178 mm = expand_bits(mm, esz); 3179 d[i] = nn | (mm << esize); 3180 } 3181 } else { 3182 uint8_t *n = vn, *m = vm; 3183 uint16_t *d16 = vd; 3184 3185 for (i = 0; i < oprsz / 2; i++) { 3186 uint16_t nn = n[H1(high + i)]; 3187 uint16_t mm = m[H1(high + i)]; 3188 3189 nn = expand_bits(nn, esz); 3190 mm = expand_bits(mm, esz); 3191 d16[H2(i)] = nn | (mm << esize); 3192 } 3193 } 3194 } 3195 } 3196 3197 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3198 { 3199 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3200 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3201 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz; 3202 uint64_t *d = vd, *n = vn, *m = vm; 3203 uint64_t l, h; 3204 intptr_t i; 3205 3206 if (oprsz <= 8) { 3207 l = compress_bits(n[0] >> odd, esz); 3208 h = compress_bits(m[0] >> odd, esz); 3209 d[0] = l | (h << (4 * oprsz)); 3210 } else { 3211 ARMPredicateReg tmp_m; 3212 intptr_t oprsz_16 = oprsz / 16; 3213 3214 if ((vm - vd) < (uintptr_t)oprsz) { 3215 m = memcpy(&tmp_m, vm, oprsz); 3216 } 3217 3218 for (i = 0; i < oprsz_16; i++) { 3219 l = n[2 * i + 0]; 3220 h = n[2 * i + 1]; 3221 l = compress_bits(l >> odd, esz); 3222 h = compress_bits(h >> odd, esz); 3223 d[i] = l | (h << 32); 3224 } 3225 3226 /* 3227 * For VL which is not a multiple of 512, the results from M do not 3228 * align nicely with the uint64_t for D. Put the aligned results 3229 * from M into TMP_M and then copy it into place afterward. 
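* The final swap_memmove below places TMP_M into the upper half of D.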
3230 */ 3231 if (oprsz & 15) { 3232 int final_shift = (oprsz & 15) * 2; 3233 3234 l = n[2 * i + 0]; 3235 h = n[2 * i + 1]; 3236 l = compress_bits(l >> odd, esz); 3237 h = compress_bits(h >> odd, esz); 3238 d[i] = l | (h << final_shift); 3239 3240 for (i = 0; i < oprsz_16; i++) { 3241 l = m[2 * i + 0]; 3242 h = m[2 * i + 1]; 3243 l = compress_bits(l >> odd, esz); 3244 h = compress_bits(h >> odd, esz); 3245 tmp_m.p[i] = l | (h << 32); 3246 } 3247 l = m[2 * i + 0]; 3248 h = m[2 * i + 1]; 3249 l = compress_bits(l >> odd, esz); 3250 h = compress_bits(h >> odd, esz); 3251 tmp_m.p[i] = l | (h << final_shift); 3252 3253 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2); 3254 } else { 3255 for (i = 0; i < oprsz_16; i++) { 3256 l = m[2 * i + 0]; 3257 h = m[2 * i + 1]; 3258 l = compress_bits(l >> odd, esz); 3259 h = compress_bits(h >> odd, esz); 3260 d[oprsz_16 + i] = l | (h << 32); 3261 } 3262 } 3263 } 3264 } 3265 3266 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc) 3267 { 3268 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3269 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3270 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA); 3271 uint64_t *d = vd, *n = vn, *m = vm; 3272 uint64_t mask; 3273 int shr, shl; 3274 intptr_t i; 3275 3276 shl = 1 << esz; 3277 shr = 0; 3278 mask = even_bit_esz_masks[esz]; 3279 if (odd) { 3280 mask <<= shl; 3281 shr = shl; 3282 shl = 0; 3283 } 3284 3285 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) { 3286 uint64_t nn = (n[i] & mask) >> shr; 3287 uint64_t mm = (m[i] & mask) << shl; 3288 d[i] = nn + mm; 3289 } 3290 } 3291 3292 /* Reverse units of 2**N bits. */ 3293 static uint64_t reverse_bits_64(uint64_t x, int n) 3294 { 3295 int i, sh; 3296 3297 x = bswap64(x); 3298 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3299 uint64_t mask = even_bit_esz_masks[i]; 3300 x = ((x & mask) << sh) | ((x >> sh) & mask); 3301 } 3302 return x; 3303 } 3304 3305 static uint8_t reverse_bits_8(uint8_t x, int n) 3306 { 3307 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f }; 3308 int i, sh; 3309 3310 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) { 3311 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]); 3312 } 3313 return x; 3314 } 3315 3316 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc) 3317 { 3318 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3319 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3320 intptr_t i, oprsz_2 = oprsz / 2; 3321 3322 if (oprsz <= 8) { 3323 uint64_t l = *(uint64_t *)vn; 3324 l = reverse_bits_64(l << (64 - 8 * oprsz), esz); 3325 *(uint64_t *)vd = l; 3326 } else if ((oprsz & 15) == 0) { 3327 for (i = 0; i < oprsz_2; i += 8) { 3328 intptr_t ih = oprsz - 8 - i; 3329 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz); 3330 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz); 3331 *(uint64_t *)(vd + i) = h; 3332 *(uint64_t *)(vd + ih) = l; 3333 } 3334 } else { 3335 for (i = 0; i < oprsz_2; i += 1) { 3336 intptr_t il = H1(i); 3337 intptr_t ih = H1(oprsz - 1 - i); 3338 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz); 3339 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz); 3340 *(uint8_t *)(vd + il) = h; 3341 *(uint8_t *)(vd + ih) = l; 3342 } 3343 } 3344 } 3345 3346 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc) 3347 { 3348 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 3349 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA); 3350 uint64_t *d = vd; 3351 intptr_t i; 3352 3353 if (oprsz <= 8) { 3354 uint64_t nn = *(uint64_t *)vn; 3355 int half = 4 * oprsz; 3356 3357 nn = 
extract64(nn, high * half, half); 3358 nn = expand_bits(nn, 0); 3359 d[0] = nn; 3360 } else { 3361 ARMPredicateReg tmp_n; 3362 3363 /* We produce output faster than we consume input. 3364 Therefore we must be mindful of possible overlap. */ 3365 if ((vn - vd) < (uintptr_t)oprsz) { 3366 vn = memcpy(&tmp_n, vn, oprsz); 3367 } 3368 if (high) { 3369 high = oprsz >> 1; 3370 } 3371 3372 if ((oprsz & 7) == 0) { 3373 uint32_t *n = vn; 3374 high >>= 2; 3375 3376 for (i = 0; i < oprsz / 8; i++) { 3377 uint64_t nn = n[H4(high + i)]; 3378 d[i] = expand_bits(nn, 0); 3379 } 3380 } else { 3381 uint16_t *d16 = vd; 3382 uint8_t *n = vn; 3383 3384 for (i = 0; i < oprsz / 2; i++) { 3385 uint16_t nn = n[H1(high + i)]; 3386 d16[H2(i)] = expand_bits(nn, 0); 3387 } 3388 } 3389 } 3390 } 3391 3392 #define DO_ZIP(NAME, TYPE, H) \ 3393 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3394 { \ 3395 intptr_t oprsz = simd_oprsz(desc); \ 3396 intptr_t odd_ofs = simd_data(desc); \ 3397 intptr_t i, oprsz_2 = oprsz / 2; \ 3398 ARMVectorReg tmp_n, tmp_m; \ 3399 /* We produce output faster than we consume input. \ 3400 Therefore we must be mindful of possible overlap. */ \ 3401 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \ 3402 vn = memcpy(&tmp_n, vn, oprsz); \ 3403 } \ 3404 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3405 vm = memcpy(&tmp_m, vm, oprsz); \ 3406 } \ 3407 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \ 3408 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \ 3409 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \ 3410 *(TYPE *)(vm + odd_ofs + H(i)); \ 3411 } \ 3412 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3413 memset(vd + oprsz - 16, 0, 16); \ 3414 } \ 3415 } 3416 3417 DO_ZIP(sve_zip_b, uint8_t, H1) 3418 DO_ZIP(sve_zip_h, uint16_t, H1_2) 3419 DO_ZIP(sve_zip_s, uint32_t, H1_4) 3420 DO_ZIP(sve_zip_d, uint64_t, H1_8) 3421 DO_ZIP(sve2_zip_q, Int128, ) 3422 3423 #define DO_UZP(NAME, TYPE, H) \ 3424 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3425 { \ 3426 intptr_t oprsz = simd_oprsz(desc); \ 3427 intptr_t odd_ofs = simd_data(desc); \ 3428 intptr_t i, p; \ 3429 ARMVectorReg tmp_m; \ 3430 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \ 3431 vm = memcpy(&tmp_m, vm, oprsz); \ 3432 } \ 3433 i = 0, p = odd_ofs; \ 3434 do { \ 3435 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \ 3436 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3437 } while (p < oprsz); \ 3438 p -= oprsz; \ 3439 do { \ 3440 *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \ 3441 i += sizeof(TYPE), p += 2 * sizeof(TYPE); \ 3442 } while (p < oprsz); \ 3443 tcg_debug_assert(i == oprsz); \ 3444 } 3445 3446 DO_UZP(sve_uzp_b, uint8_t, H1) 3447 DO_UZP(sve_uzp_h, uint16_t, H1_2) 3448 DO_UZP(sve_uzp_s, uint32_t, H1_4) 3449 DO_UZP(sve_uzp_d, uint64_t, H1_8) 3450 DO_UZP(sve2_uzp_q, Int128, ) 3451 3452 #define DO_TRN(NAME, TYPE, H) \ 3453 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 3454 { \ 3455 intptr_t oprsz = simd_oprsz(desc); \ 3456 intptr_t odd_ofs = simd_data(desc); \ 3457 intptr_t i; \ 3458 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \ 3459 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \ 3460 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \ 3461 *(TYPE *)(vd + H(i + 0)) = ae; \ 3462 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \ 3463 } \ 3464 if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \ 3465 memset(vd + oprsz - 16, 0, 16); \ 3466 } \ 3467 } 3468 3469 DO_TRN(sve_trn_b, uint8_t, H1) 3470 DO_TRN(sve_trn_h, uint16_t, H1_2) 3471 DO_TRN(sve_trn_s, uint32_t, H1_4) 3472 DO_TRN(sve_trn_d, 
uint64_t, H1_8) 3473 DO_TRN(sve2_trn_q, Int128, ) 3474 3475 #undef DO_ZIP 3476 #undef DO_UZP 3477 #undef DO_TRN 3478 3479 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc) 3480 { 3481 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4; 3482 uint32_t *d = vd, *n = vn; 3483 uint8_t *pg = vg; 3484 3485 for (i = j = 0; i < opr_sz; i++) { 3486 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) { 3487 d[H4(j)] = n[H4(i)]; 3488 j++; 3489 } 3490 } 3491 for (; j < opr_sz; j++) { 3492 d[H4(j)] = 0; 3493 } 3494 } 3495 3496 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc) 3497 { 3498 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8; 3499 uint64_t *d = vd, *n = vn; 3500 uint8_t *pg = vg; 3501 3502 for (i = j = 0; i < opr_sz; i++) { 3503 if (pg[H1(i)] & 1) { 3504 d[j] = n[i]; 3505 j++; 3506 } 3507 } 3508 for (; j < opr_sz; j++) { 3509 d[j] = 0; 3510 } 3511 } 3512 3513 /* Similar to the ARM LastActiveElement pseudocode function, except the 3514 * result is multiplied by the element size. This includes the not found 3515 * indication; e.g. not found for esz=3 is -8. 3516 */ 3517 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc) 3518 { 3519 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 3520 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 3521 3522 return last_active_element(vg, words, esz); 3523 } 3524 3525 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) 3526 { 3527 intptr_t opr_sz = simd_oprsz(desc) / 8; 3528 int esz = simd_data(desc); 3529 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz]; 3530 intptr_t i, first_i, last_i; 3531 ARMVectorReg tmp; 3532 3533 first_i = last_i = 0; 3534 first_g = last_g = 0; 3535 3536 /* Find the extent of the active elements within VG. 
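 * Only the lowest and highest 8-byte chunks of VG that contain an active
 * bit matter.  For example, with esz == 2 (.S elements) and only elements
 * 2 and 5 active, the significant predicate bits are 8 and 20, so below
 * first_i = 8, last_i = 20 and len = 20 - 8 + 4 = 16: bytes [8, 24) of ZN
 * are copied to the bottom of ZD and the rest is filled from the start
 * of ZM.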
*/ 3537 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) { 3538 pg = *(uint64_t *)(vg + i) & mask; 3539 if (pg) { 3540 if (last_g == 0) { 3541 last_g = pg; 3542 last_i = i; 3543 } 3544 first_g = pg; 3545 first_i = i; 3546 } 3547 } 3548 3549 len = 0; 3550 if (first_g != 0) { 3551 first_i = first_i * 8 + ctz64(first_g); 3552 last_i = last_i * 8 + 63 - clz64(last_g); 3553 len = last_i - first_i + (1 << esz); 3554 if (vd == vm) { 3555 vm = memcpy(&tmp, vm, opr_sz * 8); 3556 } 3557 swap_memmove(vd, vn + first_i, len); 3558 } 3559 swap_memmove(vd + len, vm, opr_sz * 8 - len); 3560 } 3561 3562 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm, 3563 void *vg, uint32_t desc) 3564 { 3565 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3566 uint64_t *d = vd, *n = vn, *m = vm; 3567 uint8_t *pg = vg; 3568 3569 for (i = 0; i < opr_sz; i += 1) { 3570 uint64_t nn = n[i], mm = m[i]; 3571 uint64_t pp = expand_pred_b(pg[H1(i)]); 3572 d[i] = (nn & pp) | (mm & ~pp); 3573 } 3574 } 3575 3576 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm, 3577 void *vg, uint32_t desc) 3578 { 3579 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3580 uint64_t *d = vd, *n = vn, *m = vm; 3581 uint8_t *pg = vg; 3582 3583 for (i = 0; i < opr_sz; i += 1) { 3584 uint64_t nn = n[i], mm = m[i]; 3585 uint64_t pp = expand_pred_h(pg[H1(i)]); 3586 d[i] = (nn & pp) | (mm & ~pp); 3587 } 3588 } 3589 3590 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm, 3591 void *vg, uint32_t desc) 3592 { 3593 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3594 uint64_t *d = vd, *n = vn, *m = vm; 3595 uint8_t *pg = vg; 3596 3597 for (i = 0; i < opr_sz; i += 1) { 3598 uint64_t nn = n[i], mm = m[i]; 3599 uint64_t pp = expand_pred_s(pg[H1(i)]); 3600 d[i] = (nn & pp) | (mm & ~pp); 3601 } 3602 } 3603 3604 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm, 3605 void *vg, uint32_t desc) 3606 { 3607 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 3608 uint64_t *d = vd, *n = vn, *m = vm; 3609 uint8_t *pg = vg; 3610 3611 for (i = 0; i < opr_sz; i += 1) { 3612 uint64_t nn = n[i], mm = m[i]; 3613 d[i] = (pg[H1(i)] & 1 ? nn : mm); 3614 } 3615 } 3616 3617 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm, 3618 void *vg, uint32_t desc) 3619 { 3620 intptr_t i, opr_sz = simd_oprsz(desc) / 16; 3621 Int128 *d = vd, *n = vn, *m = vm; 3622 uint16_t *pg = vg; 3623 3624 for (i = 0; i < opr_sz; i += 1) { 3625 d[i] = (pg[H2(i)] & 1 ? n : m)[i]; 3626 } 3627 } 3628 3629 /* Two operand comparison controlled by a predicate. 3630 * ??? It is very tempting to want to be able to expand this inline 3631 * with x86 instructions, e.g. 3632 * 3633 * vcmpeqw zm, zn, %ymm0 3634 * vpmovmskb %ymm0, %eax 3635 * and $0x5555, %eax 3636 * and pg, %eax 3637 * 3638 * or even aarch64, e.g. 3639 * 3640 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001 3641 * cmeq v0.8h, zn, zm 3642 * and v0.8h, v0.8h, mask 3643 * addv h0, v0.8h 3644 * and v0.8b, pg 3645 * 3646 * However, coming up with an abstraction that allows vector inputs and 3647 * a scalar output, and also handles the byte-ordering of sub-uint64_t 3648 * scalar outputs, is tricky. 
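 *
 * The expanders below instead walk the vector backward in 64-byte blocks,
 * shifting OUT left by sizeof(TYPE) per element, so that the result bit
 * for the element at byte offset k ends up in bit k of OUT, the same
 * layout as the governing predicate (one bit per vector byte).  MASK then
 * keeps only the bit that actually governs each element of PG (e.g.
 * 0x5555... for .H), so stray predicate bits do not leak into the stored
 * result or the flags.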
3649 */ 3650 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \ 3651 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3652 { \ 3653 intptr_t opr_sz = simd_oprsz(desc); \ 3654 uint32_t flags = PREDTEST_INIT; \ 3655 intptr_t i = opr_sz; \ 3656 do { \ 3657 uint64_t out = 0, pg; \ 3658 do { \ 3659 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3660 TYPE nn = *(TYPE *)(vn + H(i)); \ 3661 TYPE mm = *(TYPE *)(vm + H(i)); \ 3662 out |= nn OP mm; \ 3663 } while (i & 63); \ 3664 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3665 out &= pg; \ 3666 *(uint64_t *)(vd + (i >> 3)) = out; \ 3667 flags = iter_predtest_bwd(out, pg, flags); \ 3668 } while (i > 0); \ 3669 return flags; \ 3670 } 3671 3672 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \ 3673 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3674 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \ 3675 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3676 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \ 3677 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3678 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \ 3679 DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3680 3681 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==) 3682 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==) 3683 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==) 3684 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==) 3685 3686 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=) 3687 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=) 3688 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=) 3689 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=) 3690 3691 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >) 3692 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >) 3693 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >) 3694 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >) 3695 3696 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=) 3697 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=) 3698 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=) 3699 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=) 3700 3701 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >) 3702 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >) 3703 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >) 3704 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >) 3705 3706 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=) 3707 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=) 3708 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=) 3709 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=) 3710 3711 #undef DO_CMP_PPZZ_B 3712 #undef DO_CMP_PPZZ_H 3713 #undef DO_CMP_PPZZ_S 3714 #undef DO_CMP_PPZZ_D 3715 #undef DO_CMP_PPZZ 3716 3717 /* Similar, but the second source is "wide". 
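 * Each 64-bit element of ZM is compared against every TYPE-sized element
 * of ZN that lies in the same 64-bit span, which is why MM is reloaded
 * once per 8 bytes in the inner loop.  E.g. for sve_cmpeq_ppzw_b the
 * doubleword at vm + i - 8 is compared with each of the eight byte
 * elements at vn + (i - 8) .. vn + (i - 1).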
*/ 3718 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \ 3719 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 3720 { \ 3721 intptr_t opr_sz = simd_oprsz(desc); \ 3722 uint32_t flags = PREDTEST_INIT; \ 3723 intptr_t i = opr_sz; \ 3724 do { \ 3725 uint64_t out = 0, pg; \ 3726 do { \ 3727 TYPEW mm = *(TYPEW *)(vm + i - 8); \ 3728 do { \ 3729 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3730 TYPE nn = *(TYPE *)(vn + H(i)); \ 3731 out |= nn OP mm; \ 3732 } while (i & 7); \ 3733 } while (i & 63); \ 3734 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3735 out &= pg; \ 3736 *(uint64_t *)(vd + (i >> 3)) = out; \ 3737 flags = iter_predtest_bwd(out, pg, flags); \ 3738 } while (i > 0); \ 3739 return flags; \ 3740 } 3741 3742 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \ 3743 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull) 3744 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \ 3745 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull) 3746 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \ 3747 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull) 3748 3749 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==) 3750 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==) 3751 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==) 3752 3753 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=) 3754 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=) 3755 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=) 3756 3757 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >) 3758 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >) 3759 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >) 3760 3761 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=) 3762 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=) 3763 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=) 3764 3765 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >) 3766 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >) 3767 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >) 3768 3769 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=) 3770 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=) 3771 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=) 3772 3773 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <) 3774 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <) 3775 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <) 3776 3777 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=) 3778 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=) 3779 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=) 3780 3781 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <) 3782 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <) 3783 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <) 3784 3785 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=) 3786 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=) 3787 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=) 3788 3789 #undef DO_CMP_PPZW_B 3790 #undef DO_CMP_PPZW_H 3791 #undef DO_CMP_PPZW_S 3792 #undef DO_CMP_PPZW 3793 3794 /* Similar, but the second source is immediate. 
*/ 3795 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \ 3796 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \ 3797 { \ 3798 intptr_t opr_sz = simd_oprsz(desc); \ 3799 uint32_t flags = PREDTEST_INIT; \ 3800 TYPE mm = simd_data(desc); \ 3801 intptr_t i = opr_sz; \ 3802 do { \ 3803 uint64_t out = 0, pg; \ 3804 do { \ 3805 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 3806 TYPE nn = *(TYPE *)(vn + H(i)); \ 3807 out |= nn OP mm; \ 3808 } while (i & 63); \ 3809 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \ 3810 out &= pg; \ 3811 *(uint64_t *)(vd + (i >> 3)) = out; \ 3812 flags = iter_predtest_bwd(out, pg, flags); \ 3813 } while (i > 0); \ 3814 return flags; \ 3815 } 3816 3817 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \ 3818 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull) 3819 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \ 3820 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull) 3821 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \ 3822 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull) 3823 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \ 3824 DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull) 3825 3826 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==) 3827 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==) 3828 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==) 3829 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==) 3830 3831 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=) 3832 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=) 3833 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=) 3834 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=) 3835 3836 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >) 3837 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >) 3838 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >) 3839 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >) 3840 3841 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=) 3842 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=) 3843 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=) 3844 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=) 3845 3846 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >) 3847 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >) 3848 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >) 3849 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >) 3850 3851 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=) 3852 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=) 3853 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=) 3854 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=) 3855 3856 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <) 3857 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <) 3858 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <) 3859 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <) 3860 3861 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=) 3862 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=) 3863 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=) 3864 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=) 3865 3866 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <) 3867 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <) 3868 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <) 3869 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <) 3870 3871 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=) 3872 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=) 3873 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=) 3874 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=) 3875 3876 #undef DO_CMP_PPZI_B 3877 #undef DO_CMP_PPZI_H 3878 #undef DO_CMP_PPZI_S 3879 #undef DO_CMP_PPZI_D 3880 #undef DO_CMP_PPZI 3881 3882 /* Similar to the ARM LastActive pseudocode function. 
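 * I.e. return the D bit of the last element that is active in G: scanning
 * from the top, pow2floor() isolates the highest set bit of the first
 * non-zero guard word, and only that bit of D is tested.  For example, if
 * the highest set bit of G is bit 13 of some word, the result is bit 13
 * of the corresponding word of D.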
*/ 3883 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz) 3884 { 3885 intptr_t i; 3886 3887 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) { 3888 uint64_t pg = *(uint64_t *)(vg + i); 3889 if (pg) { 3890 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0; 3891 } 3892 } 3893 return 0; 3894 } 3895 3896 /* Compute a mask into RETB that is true for all G, up to and including 3897 * (if after) or excluding (if !after) the first G & N. 3898 * Return true if BRK found. 3899 */ 3900 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g, 3901 bool brk, bool after) 3902 { 3903 uint64_t b; 3904 3905 if (brk) { 3906 b = 0; 3907 } else if ((g & n) == 0) { 3908 /* For all G, no N are set; break not found. */ 3909 b = g; 3910 } else { 3911 /* Break somewhere in N. Locate it. */ 3912 b = g & n; /* guard true, pred true */ 3913 b = b & -b; /* first such */ 3914 if (after) { 3915 b = b | (b - 1); /* break after same */ 3916 } else { 3917 b = b - 1; /* break before same */ 3918 } 3919 brk = true; 3920 } 3921 3922 *retb = b; 3923 return brk; 3924 } 3925 3926 /* Compute a zeroing BRK. */ 3927 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g, 3928 intptr_t oprsz, bool after) 3929 { 3930 bool brk = false; 3931 intptr_t i; 3932 3933 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3934 uint64_t this_b, this_g = g[i]; 3935 3936 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3937 d[i] = this_b & this_g; 3938 } 3939 } 3940 3941 /* Likewise, but also compute flags. */ 3942 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g, 3943 intptr_t oprsz, bool after) 3944 { 3945 uint32_t flags = PREDTEST_INIT; 3946 bool brk = false; 3947 intptr_t i; 3948 3949 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3950 uint64_t this_b, this_d, this_g = g[i]; 3951 3952 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3953 d[i] = this_d = this_b & this_g; 3954 flags = iter_predtest_fwd(this_d, this_g, flags); 3955 } 3956 return flags; 3957 } 3958 3959 /* Compute a merging BRK. */ 3960 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g, 3961 intptr_t oprsz, bool after) 3962 { 3963 bool brk = false; 3964 intptr_t i; 3965 3966 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) { 3967 uint64_t this_b, this_g = g[i]; 3968 3969 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3970 d[i] = (this_b & this_g) | (d[i] & ~this_g); 3971 } 3972 } 3973 3974 /* Likewise, but also compute flags. */ 3975 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g, 3976 intptr_t oprsz, bool after) 3977 { 3978 uint32_t flags = PREDTEST_INIT; 3979 bool brk = false; 3980 intptr_t i; 3981 3982 for (i = 0; i < oprsz / 8; ++i) { 3983 uint64_t this_b, this_d = d[i], this_g = g[i]; 3984 3985 brk = compute_brk(&this_b, n[i], this_g, brk, after); 3986 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g); 3987 flags = iter_predtest_fwd(this_d, this_g, flags); 3988 } 3989 return flags; 3990 } 3991 3992 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz) 3993 { 3994 /* It is quicker to zero the whole predicate than loop on OPRSZ. 3995 * The compiler should turn this into 4 64-bit integer stores. 
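 * (ARMPredicateReg holds the architectural maximum of 256 predicate bits,
 * i.e. 32 bytes or four uint64_t words, while OPRSZ here may be as small
 * as 2.)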
3996 */ 3997 memset(d, 0, sizeof(ARMPredicateReg)); 3998 return PREDTEST_INIT; 3999 } 4000 4001 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg, 4002 uint32_t pred_desc) 4003 { 4004 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4005 if (last_active_pred(vn, vg, oprsz)) { 4006 compute_brk_z(vd, vm, vg, oprsz, true); 4007 } else { 4008 do_zero(vd, oprsz); 4009 } 4010 } 4011 4012 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg, 4013 uint32_t pred_desc) 4014 { 4015 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4016 if (last_active_pred(vn, vg, oprsz)) { 4017 return compute_brks_z(vd, vm, vg, oprsz, true); 4018 } else { 4019 return do_zero(vd, oprsz); 4020 } 4021 } 4022 4023 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg, 4024 uint32_t pred_desc) 4025 { 4026 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4027 if (last_active_pred(vn, vg, oprsz)) { 4028 compute_brk_z(vd, vm, vg, oprsz, false); 4029 } else { 4030 do_zero(vd, oprsz); 4031 } 4032 } 4033 4034 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg, 4035 uint32_t pred_desc) 4036 { 4037 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4038 if (last_active_pred(vn, vg, oprsz)) { 4039 return compute_brks_z(vd, vm, vg, oprsz, false); 4040 } else { 4041 return do_zero(vd, oprsz); 4042 } 4043 } 4044 4045 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4046 { 4047 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4048 compute_brk_z(vd, vn, vg, oprsz, true); 4049 } 4050 4051 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4052 { 4053 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4054 return compute_brks_z(vd, vn, vg, oprsz, true); 4055 } 4056 4057 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4058 { 4059 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4060 compute_brk_z(vd, vn, vg, oprsz, false); 4061 } 4062 4063 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4064 { 4065 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4066 return compute_brks_z(vd, vn, vg, oprsz, false); 4067 } 4068 4069 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4070 { 4071 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4072 compute_brk_m(vd, vn, vg, oprsz, true); 4073 } 4074 4075 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4076 { 4077 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4078 return compute_brks_m(vd, vn, vg, oprsz, true); 4079 } 4080 4081 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4082 { 4083 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4084 compute_brk_m(vd, vn, vg, oprsz, false); 4085 } 4086 4087 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4088 { 4089 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4090 return compute_brks_m(vd, vn, vg, oprsz, false); 4091 } 4092 4093 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4094 { 4095 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4096 if (!last_active_pred(vn, vg, oprsz)) { 4097 do_zero(vd, oprsz); 4098 } 4099 } 4100 4101 /* As if PredTest(Ones(PL), D, esz). 
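 * I.e. compute NZCV as if every element were active: N is set if the
 * first element of D is set, Z is clear if any element of D is set, and
 * C is set if the last element of D is clear.  This is what the WHILE*
 * and BRKNS helpers below require.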
*/ 4102 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz, 4103 uint64_t esz_mask) 4104 { 4105 uint32_t flags = PREDTEST_INIT; 4106 intptr_t i; 4107 4108 for (i = 0; i < oprsz / 8; i++) { 4109 flags = iter_predtest_fwd(d->p[i], esz_mask, flags); 4110 } 4111 if (oprsz & 7) { 4112 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7))); 4113 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags); 4114 } 4115 return flags; 4116 } 4117 4118 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc) 4119 { 4120 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4121 if (last_active_pred(vn, vg, oprsz)) { 4122 return predtest_ones(vd, oprsz, -1); 4123 } else { 4124 return do_zero(vd, oprsz); 4125 } 4126 } 4127 4128 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc) 4129 { 4130 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8); 4131 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4132 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz]; 4133 intptr_t i; 4134 4135 for (i = 0; i < words; ++i) { 4136 uint64_t t = n[i] & g[i] & mask; 4137 sum += ctpop64(t); 4138 } 4139 return sum; 4140 } 4141 4142 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc) 4143 { 4144 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4145 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4146 uint64_t esz_mask = pred_esz_masks[esz]; 4147 ARMPredicateReg *d = vd; 4148 uint32_t flags; 4149 intptr_t i; 4150 4151 /* Begin with a zero predicate register. */ 4152 flags = do_zero(d, oprsz); 4153 if (count == 0) { 4154 return flags; 4155 } 4156 4157 /* Set all of the requested bits. */ 4158 for (i = 0; i < count / 64; ++i) { 4159 d->p[i] = esz_mask; 4160 } 4161 if (count & 63) { 4162 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask; 4163 } 4164 4165 return predtest_ones(d, oprsz, esz_mask); 4166 } 4167 4168 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc) 4169 { 4170 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ); 4171 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ); 4172 uint64_t esz_mask = pred_esz_masks[esz]; 4173 ARMPredicateReg *d = vd; 4174 intptr_t i, invcount, oprbits; 4175 uint64_t bits; 4176 4177 if (count == 0) { 4178 return do_zero(d, oprsz); 4179 } 4180 4181 oprbits = oprsz * 8; 4182 tcg_debug_assert(count <= oprbits); 4183 4184 bits = esz_mask; 4185 if (oprbits & 63) { 4186 bits &= MAKE_64BIT_MASK(0, oprbits & 63); 4187 } 4188 4189 invcount = oprbits - count; 4190 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) { 4191 d->p[i] = bits; 4192 bits = esz_mask; 4193 } 4194 4195 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64); 4196 4197 while (--i >= 0) { 4198 d->p[i] = 0; 4199 } 4200 4201 return predtest_ones(d, oprsz, esz_mask); 4202 } 4203 4204 /* Recursive reduction on a function; 4205 * C.f. the ARM ARM function ReducePredicated. 4206 * 4207 * While it would be possible to write this without the DATA temporary, 4208 * it is much simpler to process the predicate register this way. 4209 * The recursion is bounded to depth 7 (128 fp16 elements), so there's 4210 * little to gain with a more complex non-recursive form. 
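 *
 * For example, with 8 elements the reduction evaluates
 *   ((d0 FUNC d1) FUNC (d2 FUNC d3)) FUNC ((d4 FUNC d5) FUNC (d6 FUNC d7)),
 * the same pairwise tree as ReducePredicated.  Inactive elements, and the
 * tail of DATA up to MAXSZ, are filled with IDENT first, so the shape of
 * the tree does not depend on PG.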
4211 */ 4212 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \ 4213 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \ 4214 { \ 4215 if (n == 1) { \ 4216 return *data; \ 4217 } else { \ 4218 uintptr_t half = n / 2; \ 4219 TYPE lo = NAME##_reduce(data, status, half); \ 4220 TYPE hi = NAME##_reduce(data + half, status, half); \ 4221 return FUNC(lo, hi, status); \ 4222 } \ 4223 } \ 4224 uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \ 4225 { \ 4226 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \ 4227 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \ 4228 for (i = 0; i < oprsz; ) { \ 4229 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ 4230 do { \ 4231 TYPE nn = *(TYPE *)(vn + H(i)); \ 4232 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \ 4233 i += sizeof(TYPE), pg >>= sizeof(TYPE); \ 4234 } while (i & 15); \ 4235 } \ 4236 for (; i < maxsz; i += sizeof(TYPE)) { \ 4237 *(TYPE *)((void *)data + i) = IDENT; \ 4238 } \ 4239 return NAME##_reduce(data, s, maxsz / sizeof(TYPE)); \ 4240 } 4241 4242 DO_REDUCE(sve_faddv_h, float16, H1_2, float16_add, float16_zero) 4243 DO_REDUCE(sve_faddv_s, float32, H1_4, float32_add, float32_zero) 4244 DO_REDUCE(sve_faddv_d, float64, H1_8, float64_add, float64_zero) 4245 4246 /* Identity is floatN_default_nan, without the function call. */ 4247 DO_REDUCE(sve_fminnmv_h, float16, H1_2, float16_minnum, 0x7E00) 4248 DO_REDUCE(sve_fminnmv_s, float32, H1_4, float32_minnum, 0x7FC00000) 4249 DO_REDUCE(sve_fminnmv_d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL) 4250 4251 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, float16_maxnum, 0x7E00) 4252 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, float32_maxnum, 0x7FC00000) 4253 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL) 4254 4255 DO_REDUCE(sve_fminv_h, float16, H1_2, float16_min, float16_infinity) 4256 DO_REDUCE(sve_fminv_s, float32, H1_4, float32_min, float32_infinity) 4257 DO_REDUCE(sve_fminv_d, float64, H1_8, float64_min, float64_infinity) 4258 4259 DO_REDUCE(sve_fmaxv_h, float16, H1_2, float16_max, float16_chs(float16_infinity)) 4260 DO_REDUCE(sve_fmaxv_s, float32, H1_4, float32_max, float32_chs(float32_infinity)) 4261 DO_REDUCE(sve_fmaxv_d, float64, H1_8, float64_max, float64_chs(float64_infinity)) 4262 4263 DO_REDUCE(sve_ah_fminv_h, float16, H1_2, helper_vfp_ah_minh, float16_infinity) 4264 DO_REDUCE(sve_ah_fminv_s, float32, H1_4, helper_vfp_ah_mins, float32_infinity) 4265 DO_REDUCE(sve_ah_fminv_d, float64, H1_8, helper_vfp_ah_mind, float64_infinity) 4266 4267 DO_REDUCE(sve_ah_fmaxv_h, float16, H1_2, helper_vfp_ah_maxh, 4268 float16_chs(float16_infinity)) 4269 DO_REDUCE(sve_ah_fmaxv_s, float32, H1_4, helper_vfp_ah_maxs, 4270 float32_chs(float32_infinity)) 4271 DO_REDUCE(sve_ah_fmaxv_d, float64, H1_8, helper_vfp_ah_maxd, 4272 float64_chs(float64_infinity)) 4273 4274 #undef DO_REDUCE 4275 4276 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, 4277 float_status *status, uint32_t desc) 4278 { 4279 intptr_t i = 0, opr_sz = simd_oprsz(desc); 4280 float16 result = nn; 4281 4282 do { 4283 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4284 do { 4285 if (pg & 1) { 4286 float16 mm = *(float16 *)(vm + H1_2(i)); 4287 result = float16_add(result, mm, status); 4288 } 4289 i += sizeof(float16), pg >>= sizeof(float16); 4290 } while (i & 15); 4291 } while (i < opr_sz); 4292 4293 return result; 4294 } 4295 4296 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, 4297 float_status *status, uint32_t desc) 4298 { 4299 
intptr_t i = 0, opr_sz = simd_oprsz(desc); 4300 float32 result = nn; 4301 4302 do { 4303 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); 4304 do { 4305 if (pg & 1) { 4306 float32 mm = *(float32 *)(vm + H1_2(i)); 4307 result = float32_add(result, mm, status); 4308 } 4309 i += sizeof(float32), pg >>= sizeof(float32); 4310 } while (i & 15); 4311 } while (i < opr_sz); 4312 4313 return result; 4314 } 4315 4316 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, 4317 float_status *status, uint32_t desc) 4318 { 4319 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; 4320 uint64_t *m = vm; 4321 uint8_t *pg = vg; 4322 4323 for (i = 0; i < opr_sz; i++) { 4324 if (pg[H1(i)] & 1) { 4325 nn = float64_add(nn, m[i], status); 4326 } 4327 } 4328 4329 return nn; 4330 } 4331 4332 /* Fully general three-operand expander, controlled by a predicate, 4333 * With the extra float_status parameter. 4334 */ 4335 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ 4336 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 4337 float_status *status, uint32_t desc) \ 4338 { \ 4339 intptr_t i = simd_oprsz(desc); \ 4340 uint64_t *g = vg; \ 4341 do { \ 4342 uint64_t pg = g[(i - 1) >> 6]; \ 4343 do { \ 4344 i -= sizeof(TYPE); \ 4345 if (likely((pg >> (i & 63)) & 1)) { \ 4346 TYPE nn = *(TYPE *)(vn + H(i)); \ 4347 TYPE mm = *(TYPE *)(vm + H(i)); \ 4348 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4349 } \ 4350 } while (i & 63); \ 4351 } while (i != 0); \ 4352 } 4353 4354 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) 4355 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) 4356 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add) 4357 4358 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) 4359 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) 4360 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub) 4361 4362 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) 4363 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) 4364 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul) 4365 4366 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) 4367 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) 4368 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div) 4369 4370 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) 4371 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) 4372 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min) 4373 4374 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) 4375 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) 4376 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) 4377 4378 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh) 4379 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins) 4380 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind) 4381 4382 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh) 4383 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs) 4384 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd) 4385 4386 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) 4387 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) 4388 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) 4389 4390 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) 4391 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) 4392 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum) 4393 4394 static inline float16 abd_h(float16 a, float16 b, float_status *s) 4395 { 4396 return float16_abs(float16_sub(a, b, s)); 4397 } 4398 4399 static inline float32 abd_s(float32 a, float32 b, float_status *s) 4400 { 4401 return 
float32_abs(float32_sub(a, b, s)); 4402 } 4403 4404 static inline float64 abd_d(float64 a, float64 b, float_status *s) 4405 { 4406 return float64_abs(float64_sub(a, b, s)); 4407 } 4408 4409 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */ 4410 static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat) 4411 { 4412 float16 r = float16_sub(op1, op2, stat); 4413 return float16_is_any_nan(r) ? r : float16_abs(r); 4414 } 4415 4416 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat) 4417 { 4418 float32 r = float32_sub(op1, op2, stat); 4419 return float32_is_any_nan(r) ? r : float32_abs(r); 4420 } 4421 4422 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat) 4423 { 4424 float64 r = float64_sub(op1, op2, stat); 4425 return float64_is_any_nan(r) ? r : float64_abs(r); 4426 } 4427 4428 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) 4429 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) 4430 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) 4431 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h) 4432 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s) 4433 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d) 4434 4435 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) 4436 { 4437 int b_int = MIN(MAX(b, INT_MIN), INT_MAX); 4438 return float64_scalbn(a, b_int, s); 4439 } 4440 4441 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) 4442 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) 4443 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d) 4444 4445 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) 4446 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) 4447 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd) 4448 4449 #undef DO_ZPZZ_FP 4450 4451 /* Three-operand expander, with one scalar operand, controlled by 4452 * a predicate, with the extra float_status parameter. 
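 * The scalar arrives as a uint64_t and is truncated to TYPE below, so only
 * its low 16, 32 or 64 bits are used by the _h, _s and _d forms.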
4453 */ 4454 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \ 4455 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ 4456 float_status *status, uint32_t desc) \ 4457 { \ 4458 intptr_t i = simd_oprsz(desc); \ 4459 uint64_t *g = vg; \ 4460 TYPE mm = scalar; \ 4461 do { \ 4462 uint64_t pg = g[(i - 1) >> 6]; \ 4463 do { \ 4464 i -= sizeof(TYPE); \ 4465 if (likely((pg >> (i & 63)) & 1)) { \ 4466 TYPE nn = *(TYPE *)(vn + H(i)); \ 4467 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ 4468 } \ 4469 } while (i & 63); \ 4470 } while (i != 0); \ 4471 } 4472 4473 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) 4474 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) 4475 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add) 4476 4477 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) 4478 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) 4479 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub) 4480 4481 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) 4482 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) 4483 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul) 4484 4485 static inline float16 subr_h(float16 a, float16 b, float_status *s) 4486 { 4487 return float16_sub(b, a, s); 4488 } 4489 4490 static inline float32 subr_s(float32 a, float32 b, float_status *s) 4491 { 4492 return float32_sub(b, a, s); 4493 } 4494 4495 static inline float64 subr_d(float64 a, float64 b, float_status *s) 4496 { 4497 return float64_sub(b, a, s); 4498 } 4499 4500 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) 4501 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) 4502 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d) 4503 4504 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) 4505 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) 4506 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum) 4507 4508 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) 4509 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) 4510 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum) 4511 4512 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) 4513 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) 4514 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max) 4515 4516 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) 4517 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) 4518 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) 4519 4520 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh) 4521 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs) 4522 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd) 4523 4524 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh) 4525 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins) 4526 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind) 4527 4528 /* Fully general two-operand expander, controlled by a predicate, 4529 * With the extra float_status parameter. 4530 */ 4531 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \ 4532 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 4533 float_status *status, uint32_t desc) \ 4534 { \ 4535 intptr_t i = simd_oprsz(desc); \ 4536 uint64_t *g = vg; \ 4537 do { \ 4538 uint64_t pg = g[(i - 1) >> 6]; \ 4539 do { \ 4540 i -= sizeof(TYPE); \ 4541 if (likely((pg >> (i & 63)) & 1)) { \ 4542 TYPE nn = *(TYPE *)(vn + H(i)); \ 4543 *(TYPE *)(vd + H(i)) = OP(nn, status); \ 4544 } \ 4545 } while (i & 63); \ 4546 } while (i != 0); \ 4547 } 4548 4549 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore 4550 * FZ16. 
When converting from fp16, this affects flushing input denormals; 4551 * when converting to fp16, this affects flushing output denormals. 4552 */ 4553 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) 4554 { 4555 bool save = get_flush_inputs_to_zero(fpst); 4556 float32 ret; 4557 4558 set_flush_inputs_to_zero(false, fpst); 4559 ret = float16_to_float32(f, true, fpst); 4560 set_flush_inputs_to_zero(save, fpst); 4561 return ret; 4562 } 4563 4564 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) 4565 { 4566 bool save = get_flush_inputs_to_zero(fpst); 4567 float64 ret; 4568 4569 set_flush_inputs_to_zero(false, fpst); 4570 ret = float16_to_float64(f, true, fpst); 4571 set_flush_inputs_to_zero(save, fpst); 4572 return ret; 4573 } 4574 4575 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) 4576 { 4577 bool save = get_flush_to_zero(fpst); 4578 float16 ret; 4579 4580 set_flush_to_zero(false, fpst); 4581 ret = float32_to_float16(f, true, fpst); 4582 set_flush_to_zero(save, fpst); 4583 return ret; 4584 } 4585 4586 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) 4587 { 4588 bool save = get_flush_to_zero(fpst); 4589 float16 ret; 4590 4591 set_flush_to_zero(false, fpst); 4592 ret = float64_to_float16(f, true, fpst); 4593 set_flush_to_zero(save, fpst); 4594 return ret; 4595 } 4596 4597 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) 4598 { 4599 if (float16_is_any_nan(f)) { 4600 float_raise(float_flag_invalid, s); 4601 return 0; 4602 } 4603 return float16_to_int16_round_to_zero(f, s); 4604 } 4605 4606 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) 4607 { 4608 if (float16_is_any_nan(f)) { 4609 float_raise(float_flag_invalid, s); 4610 return 0; 4611 } 4612 return float16_to_int64_round_to_zero(f, s); 4613 } 4614 4615 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) 4616 { 4617 if (float32_is_any_nan(f)) { 4618 float_raise(float_flag_invalid, s); 4619 return 0; 4620 } 4621 return float32_to_int64_round_to_zero(f, s); 4622 } 4623 4624 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) 4625 { 4626 if (float64_is_any_nan(f)) { 4627 float_raise(float_flag_invalid, s); 4628 return 0; 4629 } 4630 return float64_to_int64_round_to_zero(f, s); 4631 } 4632 4633 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) 4634 { 4635 if (float16_is_any_nan(f)) { 4636 float_raise(float_flag_invalid, s); 4637 return 0; 4638 } 4639 return float16_to_uint16_round_to_zero(f, s); 4640 } 4641 4642 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) 4643 { 4644 if (float16_is_any_nan(f)) { 4645 float_raise(float_flag_invalid, s); 4646 return 0; 4647 } 4648 return float16_to_uint64_round_to_zero(f, s); 4649 } 4650 4651 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) 4652 { 4653 if (float32_is_any_nan(f)) { 4654 float_raise(float_flag_invalid, s); 4655 return 0; 4656 } 4657 return float32_to_uint64_round_to_zero(f, s); 4658 } 4659 4660 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) 4661 { 4662 if (float64_is_any_nan(f)) { 4663 float_raise(float_flag_invalid, s); 4664 return 0; 4665 } 4666 return float64_to_uint64_round_to_zero(f, s); 4667 } 4668 4669 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) 4670 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) 4671 DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16) 4672 DO_ZPZ_FP(sve_fcvt_dh, 
uint64_t, H1_8, sve_f64_to_f16) 4673 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64) 4674 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32) 4675 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64) 4676 4677 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) 4678 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) 4679 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) 4680 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz) 4681 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz) 4682 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd) 4683 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz) 4684 4685 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) 4686 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) 4687 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) 4688 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz) 4689 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz) 4690 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd) 4691 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz) 4692 4693 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) 4694 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) 4695 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd) 4696 4697 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) 4698 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) 4699 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int) 4700 4701 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) 4702 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) 4703 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64) 4704 4705 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) 4706 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) 4707 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt) 4708 4709 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) 4710 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) 4711 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) 4712 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64) 4713 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16) 4714 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32) 4715 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64) 4716 4717 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) 4718 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) 4719 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) 4720 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64) 4721 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16) 4722 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32) 4723 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64) 4724 4725 static int16_t do_float16_logb_as_int(float16 a, float_status *s) 4726 { 4727 /* Extract frac to the top of the uint32_t. 
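 * The 10 fraction bits of the fp16 value land in bits [31:22], so for a
 * denormal clz32(frac) counts the leading zero fraction bits directly and
 * the result below is simply -15 - clz32(frac).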
*/ 4728 uint32_t frac = (uint32_t)a << (16 + 6); 4729 int16_t exp = extract32(a, 10, 5); 4730 4731 if (unlikely(exp == 0)) { 4732 if (frac != 0) { 4733 if (!get_flush_inputs_to_zero(s)) { 4734 /* denormal: bias - fractional_zeros */ 4735 return -15 - clz32(frac); 4736 } 4737 /* flush to zero */ 4738 float_raise(float_flag_input_denormal_flushed, s); 4739 } 4740 } else if (unlikely(exp == 0x1f)) { 4741 if (frac == 0) { 4742 return INT16_MAX; /* infinity */ 4743 } 4744 } else { 4745 /* normal: exp - bias */ 4746 return exp - 15; 4747 } 4748 /* nan or zero */ 4749 float_raise(float_flag_invalid, s); 4750 return INT16_MIN; 4751 } 4752 4753 static int32_t do_float32_logb_as_int(float32 a, float_status *s) 4754 { 4755 /* Extract frac to the top of the uint32_t. */ 4756 uint32_t frac = a << 9; 4757 int32_t exp = extract32(a, 23, 8); 4758 4759 if (unlikely(exp == 0)) { 4760 if (frac != 0) { 4761 if (!get_flush_inputs_to_zero(s)) { 4762 /* denormal: bias - fractional_zeros */ 4763 return -127 - clz32(frac); 4764 } 4765 /* flush to zero */ 4766 float_raise(float_flag_input_denormal_flushed, s); 4767 } 4768 } else if (unlikely(exp == 0xff)) { 4769 if (frac == 0) { 4770 return INT32_MAX; /* infinity */ 4771 } 4772 } else { 4773 /* normal: exp - bias */ 4774 return exp - 127; 4775 } 4776 /* nan or zero */ 4777 float_raise(float_flag_invalid, s); 4778 return INT32_MIN; 4779 } 4780 4781 static int64_t do_float64_logb_as_int(float64 a, float_status *s) 4782 { 4783 /* Extract frac to the top of the uint64_t. */ 4784 uint64_t frac = a << 12; 4785 int64_t exp = extract64(a, 52, 11); 4786 4787 if (unlikely(exp == 0)) { 4788 if (frac != 0) { 4789 if (!get_flush_inputs_to_zero(s)) { 4790 /* denormal: bias - fractional_zeros */ 4791 return -1023 - clz64(frac); 4792 } 4793 /* flush to zero */ 4794 float_raise(float_flag_input_denormal_flushed, s); 4795 } 4796 } else if (unlikely(exp == 0x7ff)) { 4797 if (frac == 0) { 4798 return INT64_MAX; /* infinity */ 4799 } 4800 } else { 4801 /* normal: exp - bias */ 4802 return exp - 1023; 4803 } 4804 /* nan or zero */ 4805 float_raise(float_flag_invalid, s); 4806 return INT64_MIN; 4807 } 4808 4809 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int) 4810 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int) 4811 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) 4812 4813 #undef DO_ZPZ_FP 4814 4815 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, 4816 float_status *status, uint32_t desc, 4817 uint16_t neg1, uint16_t neg3, int flags) 4818 { 4819 intptr_t i = simd_oprsz(desc); 4820 uint64_t *g = vg; 4821 4822 do { 4823 uint64_t pg = g[(i - 1) >> 6]; 4824 do { 4825 i -= 2; 4826 if (likely((pg >> (i & 63)) & 1)) { 4827 float16 e1, e2, e3, r; 4828 4829 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; 4830 e2 = *(uint16_t *)(vm + H1_2(i)); 4831 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; 4832 r = float16_muladd(e1, e2, e3, flags, status); 4833 *(uint16_t *)(vd + H1_2(i)) = r; 4834 } 4835 } while (i & 63); 4836 } while (i != 0); 4837 } 4838 4839 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4840 void *vg, float_status *status, uint32_t desc) 4841 { 4842 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 4843 } 4844 4845 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4846 void *vg, float_status *status, uint32_t desc) 4847 { 4848 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0); 4849 } 4850 4851 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4852 void 
*vg, float_status *status, uint32_t desc) 4853 { 4854 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0); 4855 } 4856 4857 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4858 void *vg, float_status *status, uint32_t desc) 4859 { 4860 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0); 4861 } 4862 4863 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4864 void *vg, float_status *status, uint32_t desc) 4865 { 4866 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 4867 float_muladd_negate_product); 4868 } 4869 4870 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4871 void *vg, float_status *status, uint32_t desc) 4872 { 4873 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 4874 float_muladd_negate_product | float_muladd_negate_c); 4875 } 4876 4877 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 4878 void *vg, float_status *status, uint32_t desc) 4879 { 4880 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 4881 float_muladd_negate_c); 4882 } 4883 4884 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, 4885 float_status *status, uint32_t desc, 4886 uint32_t neg1, uint32_t neg3, int flags) 4887 { 4888 intptr_t i = simd_oprsz(desc); 4889 uint64_t *g = vg; 4890 4891 do { 4892 uint64_t pg = g[(i - 1) >> 6]; 4893 do { 4894 i -= 4; 4895 if (likely((pg >> (i & 63)) & 1)) { 4896 float32 e1, e2, e3, r; 4897 4898 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; 4899 e2 = *(uint32_t *)(vm + H1_4(i)); 4900 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; 4901 r = float32_muladd(e1, e2, e3, flags, status); 4902 *(uint32_t *)(vd + H1_4(i)) = r; 4903 } 4904 } while (i & 63); 4905 } while (i != 0); 4906 } 4907 4908 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4909 void *vg, float_status *status, uint32_t desc) 4910 { 4911 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 4912 } 4913 4914 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4915 void *vg, float_status *status, uint32_t desc) 4916 { 4917 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0); 4918 } 4919 4920 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4921 void *vg, float_status *status, uint32_t desc) 4922 { 4923 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0); 4924 } 4925 4926 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4927 void *vg, float_status *status, uint32_t desc) 4928 { 4929 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0); 4930 } 4931 4932 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4933 void *vg, float_status *status, uint32_t desc) 4934 { 4935 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 4936 float_muladd_negate_product); 4937 } 4938 4939 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4940 void *vg, float_status *status, uint32_t desc) 4941 { 4942 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 4943 float_muladd_negate_product | float_muladd_negate_c); 4944 } 4945 4946 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 4947 void *vg, float_status *status, uint32_t desc) 4948 { 4949 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 4950 float_muladd_negate_c); 4951 } 4952 4953 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, 4954 float_status *status, uint32_t desc, 4955 uint64_t 
neg1, uint64_t neg3, int flags) 4956 { 4957 intptr_t i = simd_oprsz(desc); 4958 uint64_t *g = vg; 4959 4960 do { 4961 uint64_t pg = g[(i - 1) >> 6]; 4962 do { 4963 i -= 8; 4964 if (likely((pg >> (i & 63)) & 1)) { 4965 float64 e1, e2, e3, r; 4966 4967 e1 = *(uint64_t *)(vn + i) ^ neg1; 4968 e2 = *(uint64_t *)(vm + i); 4969 e3 = *(uint64_t *)(va + i) ^ neg3; 4970 r = float64_muladd(e1, e2, e3, flags, status); 4971 *(uint64_t *)(vd + i) = r; 4972 } 4973 } while (i & 63); 4974 } while (i != 0); 4975 } 4976 4977 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4978 void *vg, float_status *status, uint32_t desc) 4979 { 4980 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0); 4981 } 4982 4983 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4984 void *vg, float_status *status, uint32_t desc) 4985 { 4986 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0); 4987 } 4988 4989 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4990 void *vg, float_status *status, uint32_t desc) 4991 { 4992 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0); 4993 } 4994 4995 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 4996 void *vg, float_status *status, uint32_t desc) 4997 { 4998 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0); 4999 } 5000 5001 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5002 void *vg, float_status *status, uint32_t desc) 5003 { 5004 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5005 float_muladd_negate_product); 5006 } 5007 5008 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5009 void *vg, float_status *status, uint32_t desc) 5010 { 5011 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5012 float_muladd_negate_product | float_muladd_negate_c); 5013 } 5014 5015 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5016 void *vg, float_status *status, uint32_t desc) 5017 { 5018 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 5019 float_muladd_negate_c); 5020 } 5021 5022 /* Two operand floating-point comparison controlled by a predicate. 5023 * Unlike the integer version, we are not allowed to optimistically 5024 * compare operands, since the comparison may have side effects wrt 5025 * the FPSR. 
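 * Hence the predicate is tested before the comparison is made, and the
 * macros split between signaling compares (FCMGE/FCMGT/FACGE/FACGT via
 * TYPE##_compare) and quiet compares (FCMEQ/FCMNE/FCMUO via
 * TYPE##_compare_quiet), matching which forms may raise Invalid Operation
 * for a quiet NaN input.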
5026 */ 5027 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ 5028 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ 5029 float_status *status, uint32_t desc) \ 5030 { \ 5031 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 5032 uint64_t *d = vd, *g = vg; \ 5033 do { \ 5034 uint64_t out = 0, pg = g[j]; \ 5035 do { \ 5036 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 5037 if (likely((pg >> (i & 63)) & 1)) { \ 5038 TYPE nn = *(TYPE *)(vn + H(i)); \ 5039 TYPE mm = *(TYPE *)(vm + H(i)); \ 5040 out |= OP(TYPE, nn, mm, status); \ 5041 } \ 5042 } while (i & 63); \ 5043 d[j--] = out; \ 5044 } while (i > 0); \ 5045 } 5046 5047 #define DO_FPCMP_PPZZ_H(NAME, OP) \ 5048 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) 5049 #define DO_FPCMP_PPZZ_S(NAME, OP) \ 5050 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) 5051 #define DO_FPCMP_PPZZ_D(NAME, OP) \ 5052 DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP) 5053 5054 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \ 5055 DO_FPCMP_PPZZ_H(NAME, OP) \ 5056 DO_FPCMP_PPZZ_S(NAME, OP) \ 5057 DO_FPCMP_PPZZ_D(NAME, OP) 5058 5059 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 5060 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 5061 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 5062 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 5063 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 5064 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 5065 #define DO_FCMUO(TYPE, X, Y, ST) \ 5066 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered 5067 #define DO_FACGE(TYPE, X, Y, ST) \ 5068 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 5069 #define DO_FACGT(TYPE, X, Y, ST) \ 5070 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 5071 5072 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) 5073 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) 5074 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) 5075 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) 5076 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) 5077 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) 5078 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) 5079 5080 #undef DO_FPCMP_PPZZ_ALL 5081 #undef DO_FPCMP_PPZZ_D 5082 #undef DO_FPCMP_PPZZ_S 5083 #undef DO_FPCMP_PPZZ_H 5084 #undef DO_FPCMP_PPZZ 5085 5086 /* One operand floating-point comparison against zero, controlled 5087 * by a predicate. 
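 * The literal 0 passed as the second operand below is the all-zero
 * encoding, i.e. +0.0 for each of float16/32/64, so the DO_FCM* macros
 * above can be reused unchanged.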
5088 */ 5089 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ 5090 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 5091 float_status *status, uint32_t desc) \ 5092 { \ 5093 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ 5094 uint64_t *d = vd, *g = vg; \ 5095 do { \ 5096 uint64_t out = 0, pg = g[j]; \ 5097 do { \ 5098 i -= sizeof(TYPE), out <<= sizeof(TYPE); \ 5099 if ((pg >> (i & 63)) & 1) { \ 5100 TYPE nn = *(TYPE *)(vn + H(i)); \ 5101 out |= OP(TYPE, nn, 0, status); \ 5102 } \ 5103 } while (i & 63); \ 5104 d[j--] = out; \ 5105 } while (i > 0); \ 5106 } 5107 5108 #define DO_FPCMP_PPZ0_H(NAME, OP) \ 5109 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) 5110 #define DO_FPCMP_PPZ0_S(NAME, OP) \ 5111 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) 5112 #define DO_FPCMP_PPZ0_D(NAME, OP) \ 5113 DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP) 5114 5115 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \ 5116 DO_FPCMP_PPZ0_H(NAME, OP) \ 5117 DO_FPCMP_PPZ0_S(NAME, OP) \ 5118 DO_FPCMP_PPZ0_D(NAME, OP) 5119 5120 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) 5121 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) 5122 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) 5123 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) 5124 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) 5125 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) 5126 5127 /* FP Trig Multiply-Add. */ 5128 5129 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, 5130 float_status *s, uint32_t desc) 5131 { 5132 static const float16 coeff[16] = { 5133 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 5134 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 5135 }; 5136 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); 5137 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5138 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5139 float16 *d = vd, *n = vn, *m = vm; 5140 5141 for (i = 0; i < opr_sz; i++) { 5142 float16 mm = m[i]; 5143 intptr_t xx = x; 5144 int flags = 0; 5145 5146 if (float16_is_neg(mm)) { 5147 if (fpcr_ah) { 5148 flags = float_muladd_negate_product; 5149 } else { 5150 mm = float16_abs(mm); 5151 } 5152 xx += 8; 5153 } 5154 d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s); 5155 } 5156 } 5157 5158 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, 5159 float_status *s, uint32_t desc) 5160 { 5161 static const float32 coeff[16] = { 5162 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, 5163 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, 5164 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, 5165 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, 5166 }; 5167 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); 5168 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5169 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5170 float32 *d = vd, *n = vn, *m = vm; 5171 5172 for (i = 0; i < opr_sz; i++) { 5173 float32 mm = m[i]; 5174 intptr_t xx = x; 5175 int flags = 0; 5176 5177 if (float32_is_neg(mm)) { 5178 if (fpcr_ah) { 5179 flags = float_muladd_negate_product; 5180 } else { 5181 mm = float32_abs(mm); 5182 } 5183 xx += 8; 5184 } 5185 d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s); 5186 } 5187 } 5188 5189 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, 5190 float_status *s, uint32_t desc) 5191 { 5192 static const float64 coeff[16] = { 5193 0x3ff0000000000000ull, 0xbfc5555555555543ull, 5194 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, 5195 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, 5196 0x3de5d8408868552full, 0x0000000000000000ull, 5197 0x3ff0000000000000ull, 0xbfe0000000000000ull, 5198 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, 5199 
0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, 5200 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, 5201 }; 5202 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); 5203 intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); 5204 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); 5205 float64 *d = vd, *n = vn, *m = vm; 5206 5207 for (i = 0; i < opr_sz; i++) { 5208 float64 mm = m[i]; 5209 intptr_t xx = x; 5210 int flags = 0; 5211 5212 if (float64_is_neg(mm)) { 5213 if (fpcr_ah) { 5214 flags = float_muladd_negate_product; 5215 } else { 5216 mm = float64_abs(mm); 5217 } 5218 xx += 8; 5219 } 5220 d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s); 5221 } 5222 } 5223 5224 /* 5225 * FP Complex Add 5226 */ 5227 5228 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, 5229 float_status *s, uint32_t desc) 5230 { 5231 intptr_t j, i = simd_oprsz(desc); 5232 uint64_t *g = vg; 5233 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5234 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5235 5236 do { 5237 uint64_t pg = g[(i - 1) >> 6]; 5238 do { 5239 float16 e0, e1, e2, e3; 5240 5241 /* I holds the real index; J holds the imag index. */ 5242 j = i - sizeof(float16); 5243 i -= 2 * sizeof(float16); 5244 5245 e0 = *(float16 *)(vn + H1_2(i)); 5246 e1 = *(float16 *)(vm + H1_2(j)); 5247 e2 = *(float16 *)(vn + H1_2(j)); 5248 e3 = *(float16 *)(vm + H1_2(i)); 5249 5250 if (rot) { 5251 e3 = float16_maybe_ah_chs(e3, fpcr_ah); 5252 } else { 5253 e1 = float16_maybe_ah_chs(e1, fpcr_ah); 5254 } 5255 5256 if (likely((pg >> (i & 63)) & 1)) { 5257 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s); 5258 } 5259 if (likely((pg >> (j & 63)) & 1)) { 5260 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s); 5261 } 5262 } while (i & 63); 5263 } while (i != 0); 5264 } 5265 5266 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, 5267 float_status *s, uint32_t desc) 5268 { 5269 intptr_t j, i = simd_oprsz(desc); 5270 uint64_t *g = vg; 5271 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5272 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5273 5274 do { 5275 uint64_t pg = g[(i - 1) >> 6]; 5276 do { 5277 float32 e0, e1, e2, e3; 5278 5279 /* I holds the real index; J holds the imag index. */ 5280 j = i - sizeof(float32); 5281 i -= 2 * sizeof(float32); 5282 5283 e0 = *(float32 *)(vn + H1_2(i)); 5284 e1 = *(float32 *)(vm + H1_2(j)); 5285 e2 = *(float32 *)(vn + H1_2(j)); 5286 e3 = *(float32 *)(vm + H1_2(i)); 5287 5288 if (rot) { 5289 e3 = float32_maybe_ah_chs(e3, fpcr_ah); 5290 } else { 5291 e1 = float32_maybe_ah_chs(e1, fpcr_ah); 5292 } 5293 5294 if (likely((pg >> (i & 63)) & 1)) { 5295 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s); 5296 } 5297 if (likely((pg >> (j & 63)) & 1)) { 5298 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s); 5299 } 5300 } while (i & 63); 5301 } while (i != 0); 5302 } 5303 5304 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, 5305 float_status *s, uint32_t desc) 5306 { 5307 intptr_t j, i = simd_oprsz(desc); 5308 uint64_t *g = vg; 5309 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 5310 bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5311 5312 do { 5313 uint64_t pg = g[(i - 1) >> 6]; 5314 do { 5315 float64 e0, e1, e2, e3; 5316 5317 /* I holds the real index; J holds the imag index. 
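 * Elements are interleaved (real, imag) pairs; with rot clear the
 * m.imag operand (e1) is negated, giving the #90 rotation, and with
 * rot set the m.real operand (e3) is negated, giving #270.  The
 * negation respects FPCR.AH via float64_maybe_ah_chs.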
*/ 5318 j = i - sizeof(float64); 5319 i -= 2 * sizeof(float64); 5320 5321 e0 = *(float64 *)(vn + H1_2(i)); 5322 e1 = *(float64 *)(vm + H1_2(j)); 5323 e2 = *(float64 *)(vn + H1_2(j)); 5324 e3 = *(float64 *)(vm + H1_2(i)); 5325 5326 if (rot) { 5327 e3 = float64_maybe_ah_chs(e3, fpcr_ah); 5328 } else { 5329 e1 = float64_maybe_ah_chs(e1, fpcr_ah); 5330 } 5331 5332 if (likely((pg >> (i & 63)) & 1)) { 5333 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s); 5334 } 5335 if (likely((pg >> (j & 63)) & 1)) { 5336 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s); 5337 } 5338 } while (i & 63); 5339 } while (i != 0); 5340 } 5341 5342 /* 5343 * FP Complex Multiply 5344 */ 5345 5346 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, 5347 void *vg, float_status *status, uint32_t desc) 5348 { 5349 intptr_t j, i = simd_oprsz(desc); 5350 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5351 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5352 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5353 uint32_t negf_real = flip ^ negf_imag; 5354 float16 negx_imag, negx_real; 5355 uint64_t *g = vg; 5356 5357 /* With AH=0, use negx; with AH=1 use negf. */ 5358 negx_real = (negf_real & ~fpcr_ah) << 15; 5359 negx_imag = (negf_imag & ~fpcr_ah) << 15; 5360 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5361 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5362 5363 do { 5364 uint64_t pg = g[(i - 1) >> 6]; 5365 do { 5366 float16 e1, e2, e3, e4, nr, ni, mr, mi, d; 5367 5368 /* I holds the real index; J holds the imag index. */ 5369 j = i - sizeof(float16); 5370 i -= 2 * sizeof(float16); 5371 5372 nr = *(float16 *)(vn + H1_2(i)); 5373 ni = *(float16 *)(vn + H1_2(j)); 5374 mr = *(float16 *)(vm + H1_2(i)); 5375 mi = *(float16 *)(vm + H1_2(j)); 5376 5377 e2 = (flip ? ni : nr); 5378 e1 = (flip ? mi : mr) ^ negx_real; 5379 e4 = e2; 5380 e3 = (flip ? mr : mi) ^ negx_imag; 5381 5382 if (likely((pg >> (i & 63)) & 1)) { 5383 d = *(float16 *)(va + H1_2(i)); 5384 d = float16_muladd(e2, e1, d, negf_real, status); 5385 *(float16 *)(vd + H1_2(i)) = d; 5386 } 5387 if (likely((pg >> (j & 63)) & 1)) { 5388 d = *(float16 *)(va + H1_2(j)); 5389 d = float16_muladd(e4, e3, d, negf_imag, status); 5390 *(float16 *)(vd + H1_2(j)) = d; 5391 } 5392 } while (i & 63); 5393 } while (i != 0); 5394 } 5395 5396 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, 5397 void *vg, float_status *status, uint32_t desc) 5398 { 5399 intptr_t j, i = simd_oprsz(desc); 5400 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5401 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5402 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5403 uint32_t negf_real = flip ^ negf_imag; 5404 float32 negx_imag, negx_real; 5405 uint64_t *g = vg; 5406 5407 /* With AH=0, use negx; with AH=1 use negf. */ 5408 negx_real = (negf_real & ~fpcr_ah) << 31; 5409 negx_imag = (negf_imag & ~fpcr_ah) << 31; 5410 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5411 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5412 5413 do { 5414 uint64_t pg = g[(i - 1) >> 6]; 5415 do { 5416 float32 e1, e2, e3, e4, nr, ni, mr, mi, d; 5417 5418 /* I holds the real index; J holds the imag index. */ 5419 j = i - sizeof(float32); 5420 i -= 2 * sizeof(float32); 5421 5422 nr = *(float32 *)(vn + H1_2(i)); 5423 ni = *(float32 *)(vn + H1_2(j)); 5424 mr = *(float32 *)(vm + H1_2(i)); 5425 mi = *(float32 *)(vm + H1_2(j)); 5426 5427 e2 = (flip ? 
ni : nr); 5428 e1 = (flip ? mi : mr) ^ negx_real; 5429 e4 = e2; 5430 e3 = (flip ? mr : mi) ^ negx_imag; 5431 5432 if (likely((pg >> (i & 63)) & 1)) { 5433 d = *(float32 *)(va + H1_2(i)); 5434 d = float32_muladd(e2, e1, d, negf_real, status); 5435 *(float32 *)(vd + H1_2(i)) = d; 5436 } 5437 if (likely((pg >> (j & 63)) & 1)) { 5438 d = *(float32 *)(va + H1_2(j)); 5439 d = float32_muladd(e4, e3, d, negf_imag, status); 5440 *(float32 *)(vd + H1_2(j)) = d; 5441 } 5442 } while (i & 63); 5443 } while (i != 0); 5444 } 5445 5446 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, 5447 void *vg, float_status *status, uint32_t desc) 5448 { 5449 intptr_t j, i = simd_oprsz(desc); 5450 bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); 5451 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 5452 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 5453 uint32_t negf_real = flip ^ negf_imag; 5454 float64 negx_imag, negx_real; 5455 uint64_t *g = vg; 5456 5457 /* With AH=0, use negx; with AH=1 use negf. */ 5458 negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63; 5459 negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63; 5460 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 5461 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 5462 5463 do { 5464 uint64_t pg = g[(i - 1) >> 6]; 5465 do { 5466 float64 e1, e2, e3, e4, nr, ni, mr, mi, d; 5467 5468 /* I holds the real index; J holds the imag index. */ 5469 j = i - sizeof(float64); 5470 i -= 2 * sizeof(float64); 5471 5472 nr = *(float64 *)(vn + H1_2(i)); 5473 ni = *(float64 *)(vn + H1_2(j)); 5474 mr = *(float64 *)(vm + H1_2(i)); 5475 mi = *(float64 *)(vm + H1_2(j)); 5476 5477 e2 = (flip ? ni : nr); 5478 e1 = (flip ? mi : mr) ^ negx_real; 5479 e4 = e2; 5480 e3 = (flip ? mr : mi) ^ negx_imag; 5481 5482 if (likely((pg >> (i & 63)) & 1)) { 5483 d = *(float64 *)(va + H1_2(i)); 5484 d = float64_muladd(e2, e1, d, negf_real, status); 5485 *(float64 *)(vd + H1_2(i)) = d; 5486 } 5487 if (likely((pg >> (j & 63)) & 1)) { 5488 d = *(float64 *)(va + H1_2(j)); 5489 d = float64_muladd(e4, e3, d, negf_imag, status); 5490 *(float64 *)(vd + H1_2(j)) = d; 5491 } 5492 } while (i & 63); 5493 } while (i != 0); 5494 } 5495 5496 /* 5497 * Load contiguous data, protected by a governing predicate. 5498 */ 5499 5500 /* 5501 * Skip through a sequence of inactive elements in the guarding predicate @vg, 5502 * beginning at @reg_off bounded by @reg_max. Return the offset of the active 5503 * element >= @reg_off, or @reg_max if there were no active elements at all. 5504 */ 5505 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off, 5506 intptr_t reg_max, int esz) 5507 { 5508 uint64_t pg_mask = pred_esz_masks[esz]; 5509 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63); 5510 5511 /* In normal usage, the first element is active. */ 5512 if (likely(pg & 1)) { 5513 return reg_off; 5514 } 5515 5516 if (pg == 0) { 5517 reg_off &= -64; 5518 do { 5519 reg_off += 64; 5520 if (unlikely(reg_off >= reg_max)) { 5521 /* The entire predicate was false. */ 5522 return reg_max; 5523 } 5524 pg = vg[reg_off >> 6] & pg_mask; 5525 } while (pg == 0); 5526 } 5527 reg_off += ctz64(pg); 5528 5529 /* We should never see an out of range predicate bit set. */ 5530 tcg_debug_assert(reg_off < reg_max); 5531 return reg_off; 5532 } 5533 5534 /* 5535 * Resolve the guest virtual address to info->host and info->flags. 5536 * If @nofault, return false if the page is invalid, otherwise 5537 * exit via page fault exception. 
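 *
 * On success, info->flags holds the TLB flags for the page (TLB_MMIO,
 * TLB_WATCHPOINT, ...), and info->attrs and info->tagged describe its
 * memory attributes.  For directly addressable (non-MMIO) pages,
 * info->host is adjusted to be relative to @addr, so that
 * info->host + mem_off is the host address of @addr + mem_off.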
5538 */ 5539 5540 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env, 5541 target_ulong addr, int mem_off, MMUAccessType access_type, 5542 int mmu_idx, uintptr_t retaddr) 5543 { 5544 int flags; 5545 5546 addr += mem_off; 5547 5548 /* 5549 * User-only currently always issues with TBI. See the comment 5550 * above useronly_clean_ptr. Usually we clean this top byte away 5551 * during translation, but we can't do that for e.g. vector + imm 5552 * addressing modes. 5553 * 5554 * We currently always enable TBI for user-only, and do not provide 5555 * a way to turn it off. So clean the pointer unconditionally here, 5556 * rather than look it up here, or pass it down from above. 5557 */ 5558 addr = useronly_clean_ptr(addr); 5559 5560 #ifdef CONFIG_USER_ONLY 5561 flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault, 5562 &info->host, retaddr); 5563 #else 5564 CPUTLBEntryFull *full; 5565 flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault, 5566 &info->host, &full, retaddr); 5567 #endif 5568 info->flags = flags; 5569 5570 if (flags & TLB_INVALID_MASK) { 5571 g_assert(nofault); 5572 return false; 5573 } 5574 5575 #ifdef CONFIG_USER_ONLY 5576 memset(&info->attrs, 0, sizeof(info->attrs)); 5577 /* Require both ANON and MTE; see allocation_tag_mem(). */ 5578 info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE); 5579 #else 5580 info->attrs = full->attrs; 5581 info->tagged = full->extra.arm.pte_attrs == 0xf0; 5582 #endif 5583 5584 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */ 5585 info->host -= mem_off; 5586 return true; 5587 } 5588 5589 /* 5590 * Find first active element on each page, and a loose bound for the 5591 * final element on each page. Identify any single element that spans 5592 * the page boundary. Return true if there are any active elements. 5593 */ 5594 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg, 5595 intptr_t reg_max, int esz, int msize) 5596 { 5597 const int esize = 1 << esz; 5598 const uint64_t pg_mask = pred_esz_masks[esz]; 5599 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split; 5600 intptr_t mem_off_last, mem_off_split; 5601 intptr_t page_split, elt_split; 5602 intptr_t i; 5603 5604 /* Set all of the element indices to -1, and the TLB data to 0. */ 5605 memset(info, -1, offsetof(SVEContLdSt, page)); 5606 memset(info->page, 0, sizeof(info->page)); 5607 5608 /* Gross scan over the entire predicate to find bounds. */ 5609 i = 0; 5610 do { 5611 uint64_t pg = vg[i] & pg_mask; 5612 if (pg) { 5613 reg_off_last = i * 64 + 63 - clz64(pg); 5614 if (reg_off_first < 0) { 5615 reg_off_first = i * 64 + ctz64(pg); 5616 } 5617 } 5618 } while (++i * 64 < reg_max); 5619 5620 if (unlikely(reg_off_first < 0)) { 5621 /* No active elements, no pages touched. */ 5622 return false; 5623 } 5624 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max); 5625 5626 info->reg_off_first[0] = reg_off_first; 5627 info->mem_off_first[0] = (reg_off_first >> esz) * msize; 5628 mem_off_last = (reg_off_last >> esz) * msize; 5629 5630 page_split = -(addr | TARGET_PAGE_MASK); 5631 if (likely(mem_off_last + msize <= page_split)) { 5632 /* The entire operation fits within a single page. 
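 * info->page_split keeps its initial -1, which lets
 * sve_cont_ldst_pages and the callers skip all second-page handling.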
*/ 5633 info->reg_off_last[0] = reg_off_last; 5634 return true; 5635 } 5636 5637 info->page_split = page_split; 5638 elt_split = page_split / msize; 5639 reg_off_split = elt_split << esz; 5640 mem_off_split = elt_split * msize; 5641 5642 /* 5643 * This is the last full element on the first page, but it is not 5644 * necessarily active. If there is no full element, i.e. the first 5645 * active element is the one that's split, this value remains -1. 5646 * It is useful as iteration bounds. 5647 */ 5648 if (elt_split != 0) { 5649 info->reg_off_last[0] = reg_off_split - esize; 5650 } 5651 5652 /* Determine if an unaligned element spans the pages. */ 5653 if (page_split % msize != 0) { 5654 /* It is helpful to know if the split element is active. */ 5655 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) { 5656 info->reg_off_split = reg_off_split; 5657 info->mem_off_split = mem_off_split; 5658 5659 if (reg_off_split == reg_off_last) { 5660 /* The page crossing element is last. */ 5661 return true; 5662 } 5663 } 5664 reg_off_split += esize; 5665 mem_off_split += msize; 5666 } 5667 5668 /* 5669 * We do want the first active element on the second page, because 5670 * this may affect the address reported in an exception. 5671 */ 5672 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz); 5673 tcg_debug_assert(reg_off_split <= reg_off_last); 5674 info->reg_off_first[1] = reg_off_split; 5675 info->mem_off_first[1] = (reg_off_split >> esz) * msize; 5676 info->reg_off_last[1] = reg_off_last; 5677 return true; 5678 } 5679 5680 /* 5681 * Resolve the guest virtual addresses to info->page[]. 5682 * Control the generation of page faults with @fault. Return false if 5683 * there is no work to do, which can only happen with @fault == FAULT_NO. 5684 */ 5685 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault, 5686 CPUARMState *env, target_ulong addr, 5687 MMUAccessType access_type, uintptr_t retaddr) 5688 { 5689 int mmu_idx = arm_env_mmu_index(env); 5690 int mem_off = info->mem_off_first[0]; 5691 bool nofault = fault == FAULT_NO; 5692 bool have_work = true; 5693 5694 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off, 5695 access_type, mmu_idx, retaddr)) { 5696 /* No work to be done. */ 5697 return false; 5698 } 5699 5700 if (likely(info->page_split < 0)) { 5701 /* The entire operation was on the one page. */ 5702 return true; 5703 } 5704 5705 /* 5706 * If the second page is invalid, then we want the fault address to be 5707 * the first byte on that page which is accessed. 5708 */ 5709 if (info->mem_off_split >= 0) { 5710 /* 5711 * There is an element split across the pages. The fault address 5712 * should be the first byte of the second page. 5713 */ 5714 mem_off = info->page_split; 5715 /* 5716 * If the split element is also the first active element 5717 * of the vector, then: For first-fault we should continue 5718 * to generate faults for the second page. For no-fault, 5719 * we have work only if the second page is valid. 5720 */ 5721 if (info->mem_off_first[0] < info->mem_off_split) { 5722 nofault = FAULT_FIRST; 5723 have_work = false; 5724 } 5725 } else { 5726 /* 5727 * There is no element split across the pages. The fault address 5728 * should be the first active element on the second page. 5729 */ 5730 mem_off = info->mem_off_first[1]; 5731 /* 5732 * There must have been one active element on the first page, 5733 * so we're out of first-fault territory. 
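 * So a fault on the second page is only taken for FAULT_ALL; for
 * FAULT_FIRST and FAULT_NO the probe below is made non-faulting.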
5734 */ 5735 nofault = fault != FAULT_ALL; 5736 } 5737 5738 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off, 5739 access_type, mmu_idx, retaddr); 5740 return have_work; 5741 } 5742 5743 #ifndef CONFIG_USER_ONLY 5744 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, 5745 uint64_t *vg, target_ulong addr, 5746 int esize, int msize, int wp_access, 5747 uintptr_t retaddr) 5748 { 5749 intptr_t mem_off, reg_off, reg_last; 5750 int flags0 = info->page[0].flags; 5751 int flags1 = info->page[1].flags; 5752 5753 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) { 5754 return; 5755 } 5756 5757 /* Indicate that watchpoints are handled. */ 5758 info->page[0].flags = flags0 & ~TLB_WATCHPOINT; 5759 info->page[1].flags = flags1 & ~TLB_WATCHPOINT; 5760 5761 if (flags0 & TLB_WATCHPOINT) { 5762 mem_off = info->mem_off_first[0]; 5763 reg_off = info->reg_off_first[0]; 5764 reg_last = info->reg_off_last[0]; 5765 5766 while (reg_off <= reg_last) { 5767 uint64_t pg = vg[reg_off >> 6]; 5768 do { 5769 if ((pg >> (reg_off & 63)) & 1) { 5770 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 5771 msize, info->page[0].attrs, 5772 wp_access, retaddr); 5773 } 5774 reg_off += esize; 5775 mem_off += msize; 5776 } while (reg_off <= reg_last && (reg_off & 63)); 5777 } 5778 } 5779 5780 mem_off = info->mem_off_split; 5781 if (mem_off >= 0) { 5782 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize, 5783 info->page[0].attrs, wp_access, retaddr); 5784 } 5785 5786 mem_off = info->mem_off_first[1]; 5787 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) { 5788 reg_off = info->reg_off_first[1]; 5789 reg_last = info->reg_off_last[1]; 5790 5791 do { 5792 uint64_t pg = vg[reg_off >> 6]; 5793 do { 5794 if ((pg >> (reg_off & 63)) & 1) { 5795 cpu_check_watchpoint(env_cpu(env), addr + mem_off, 5796 msize, info->page[1].attrs, 5797 wp_access, retaddr); 5798 } 5799 reg_off += esize; 5800 mem_off += msize; 5801 } while (reg_off & 63); 5802 } while (reg_off <= reg_last); 5803 } 5804 } 5805 #endif 5806 5807 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env, 5808 uint64_t *vg, target_ulong addr, int esize, 5809 int msize, uint32_t mtedesc, uintptr_t ra) 5810 { 5811 intptr_t mem_off, reg_off, reg_last; 5812 5813 /* Process the page only if MemAttr == Tagged. */ 5814 if (info->page[0].tagged) { 5815 mem_off = info->mem_off_first[0]; 5816 reg_off = info->reg_off_first[0]; 5817 reg_last = info->reg_off_split; 5818 if (reg_last < 0) { 5819 reg_last = info->reg_off_last[0]; 5820 } 5821 5822 do { 5823 uint64_t pg = vg[reg_off >> 6]; 5824 do { 5825 if ((pg >> (reg_off & 63)) & 1) { 5826 mte_check(env, mtedesc, addr, ra); 5827 } 5828 reg_off += esize; 5829 mem_off += msize; 5830 } while (reg_off <= reg_last && (reg_off & 63)); 5831 } while (reg_off <= reg_last); 5832 } 5833 5834 mem_off = info->mem_off_first[1]; 5835 if (mem_off >= 0 && info->page[1].tagged) { 5836 reg_off = info->reg_off_first[1]; 5837 reg_last = info->reg_off_last[1]; 5838 5839 do { 5840 uint64_t pg = vg[reg_off >> 6]; 5841 do { 5842 if ((pg >> (reg_off & 63)) & 1) { 5843 mte_check(env, mtedesc, addr, ra); 5844 } 5845 reg_off += esize; 5846 mem_off += msize; 5847 } while (reg_off & 63); 5848 } while (reg_off <= reg_last); 5849 } 5850 } 5851 5852 /* 5853 * Common helper for all contiguous 1,2,3,4-register predicated stores. 
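 *
 * Each active element loads N consecutive msz-sized values from the
 * interleaved data at addr into zregs[rd] .. zregs[(rd + N - 1) & 31];
 * the destination registers are zeroed first, so inactive elements
 * read back as zero.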
5854 */ 5855 static inline QEMU_ALWAYS_INLINE 5856 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, 5857 uint32_t desc, const uintptr_t retaddr, 5858 const int esz, const int msz, const int N, uint32_t mtedesc, 5859 sve_ldst1_host_fn *host_fn, 5860 sve_ldst1_tlb_fn *tlb_fn) 5861 { 5862 const unsigned rd = simd_data(desc); 5863 const intptr_t reg_max = simd_oprsz(desc); 5864 intptr_t reg_off, reg_last, mem_off; 5865 SVEContLdSt info; 5866 void *host; 5867 int flags, i; 5868 5869 /* Find the active elements. */ 5870 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 5871 /* The entire predicate was false; no load occurs. */ 5872 for (i = 0; i < N; ++i) { 5873 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5874 } 5875 return; 5876 } 5877 5878 /* Probe the page(s). Exit with exception for any invalid page. */ 5879 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr); 5880 5881 /* Handle watchpoints for all active elements. */ 5882 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 5883 BP_MEM_READ, retaddr); 5884 5885 /* 5886 * Handle mte checks for all active elements. 5887 * Since TBI must be set for MTE, !mtedesc => !mte_active. 5888 */ 5889 if (mtedesc) { 5890 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 5891 mtedesc, retaddr); 5892 } 5893 5894 flags = info.page[0].flags | info.page[1].flags; 5895 if (unlikely(flags != 0)) { 5896 /* 5897 * At least one page includes MMIO. 5898 * Any bus operation can fail with cpu_transaction_failed, 5899 * which for ARM will raise SyncExternal. Perform the load 5900 * into scratch memory to preserve register state until the end. 5901 */ 5902 ARMVectorReg scratch[4] = { }; 5903 5904 mem_off = info.mem_off_first[0]; 5905 reg_off = info.reg_off_first[0]; 5906 reg_last = info.reg_off_last[1]; 5907 if (reg_last < 0) { 5908 reg_last = info.reg_off_split; 5909 if (reg_last < 0) { 5910 reg_last = info.reg_off_last[0]; 5911 } 5912 } 5913 5914 do { 5915 uint64_t pg = vg[reg_off >> 6]; 5916 do { 5917 if ((pg >> (reg_off & 63)) & 1) { 5918 for (i = 0; i < N; ++i) { 5919 tlb_fn(env, &scratch[i], reg_off, 5920 addr + mem_off + (i << msz), retaddr); 5921 } 5922 } 5923 reg_off += 1 << esz; 5924 mem_off += N << msz; 5925 } while (reg_off & 63); 5926 } while (reg_off <= reg_last); 5927 5928 for (i = 0; i < N; ++i) { 5929 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max); 5930 } 5931 return; 5932 } 5933 5934 /* The entire operation is in RAM, on valid pages. */ 5935 5936 for (i = 0; i < N; ++i) { 5937 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max); 5938 } 5939 5940 mem_off = info.mem_off_first[0]; 5941 reg_off = info.reg_off_first[0]; 5942 reg_last = info.reg_off_last[0]; 5943 host = info.page[0].host; 5944 5945 set_helper_retaddr(retaddr); 5946 5947 while (reg_off <= reg_last) { 5948 uint64_t pg = vg[reg_off >> 6]; 5949 do { 5950 if ((pg >> (reg_off & 63)) & 1) { 5951 for (i = 0; i < N; ++i) { 5952 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5953 host + mem_off + (i << msz)); 5954 } 5955 } 5956 reg_off += 1 << esz; 5957 mem_off += N << msz; 5958 } while (reg_off <= reg_last && (reg_off & 63)); 5959 } 5960 5961 clear_helper_retaddr(); 5962 5963 /* 5964 * Use the slow path to manage the cross-page misalignment. 5965 * But we know this is RAM and cannot trap. 
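 * Only the single element that straddles the page boundary goes back
 * through tlb_fn, which copes with the split access itself; elements
 * wholly on the second page are then copied via that page's host
 * pointer, just as above.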
5966 */ 5967 mem_off = info.mem_off_split; 5968 if (unlikely(mem_off >= 0)) { 5969 reg_off = info.reg_off_split; 5970 for (i = 0; i < N; ++i) { 5971 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 5972 addr + mem_off + (i << msz), retaddr); 5973 } 5974 } 5975 5976 mem_off = info.mem_off_first[1]; 5977 if (unlikely(mem_off >= 0)) { 5978 reg_off = info.reg_off_first[1]; 5979 reg_last = info.reg_off_last[1]; 5980 host = info.page[1].host; 5981 5982 set_helper_retaddr(retaddr); 5983 5984 do { 5985 uint64_t pg = vg[reg_off >> 6]; 5986 do { 5987 if ((pg >> (reg_off & 63)) & 1) { 5988 for (i = 0; i < N; ++i) { 5989 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 5990 host + mem_off + (i << msz)); 5991 } 5992 } 5993 reg_off += 1 << esz; 5994 mem_off += N << msz; 5995 } while (reg_off & 63); 5996 } while (reg_off <= reg_last); 5997 5998 clear_helper_retaddr(); 5999 } 6000 } 6001 6002 static inline QEMU_ALWAYS_INLINE 6003 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6004 uint32_t desc, const uintptr_t ra, 6005 const int esz, const int msz, const int N, 6006 sve_ldst1_host_fn *host_fn, 6007 sve_ldst1_tlb_fn *tlb_fn) 6008 { 6009 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6010 int bit55 = extract64(addr, 55, 1); 6011 6012 /* Remove mtedesc from the normal sve descriptor. */ 6013 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6014 6015 /* Perform gross MTE suppression early. */ 6016 if (!tbi_check(mtedesc, bit55) || 6017 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6018 mtedesc = 0; 6019 } 6020 6021 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6022 } 6023 6024 #define DO_LD1_1(NAME, ESZ) \ 6025 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ 6026 target_ulong addr, uint32_t desc) \ 6027 { \ 6028 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ 6029 sve_##NAME##_host, sve_##NAME##_tlb); \ 6030 } \ 6031 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6032 target_ulong addr, uint32_t desc) \ 6033 { \ 6034 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ 6035 sve_##NAME##_host, sve_##NAME##_tlb); \ 6036 } 6037 6038 #define DO_LD1_2(NAME, ESZ, MSZ) \ 6039 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ 6040 target_ulong addr, uint32_t desc) \ 6041 { \ 6042 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 6043 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 6044 } \ 6045 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ 6046 target_ulong addr, uint32_t desc) \ 6047 { \ 6048 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ 6049 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 6050 } \ 6051 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6052 target_ulong addr, uint32_t desc) \ 6053 { \ 6054 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 6055 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ 6056 } \ 6057 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6058 target_ulong addr, uint32_t desc) \ 6059 { \ 6060 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ 6061 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ 6062 } 6063 6064 DO_LD1_1(ld1bb, MO_8) 6065 DO_LD1_1(ld1bhu, MO_16) 6066 DO_LD1_1(ld1bhs, MO_16) 6067 DO_LD1_1(ld1bsu, MO_32) 6068 DO_LD1_1(ld1bss, MO_32) 6069 DO_LD1_1(ld1bdu, MO_64) 6070 DO_LD1_1(ld1bds, MO_64) 6071 6072 DO_LD1_2(ld1hh, MO_16, MO_16) 6073 DO_LD1_2(ld1hsu, MO_32, MO_16) 6074 DO_LD1_2(ld1hss, MO_32, MO_16) 6075 DO_LD1_2(ld1hdu, MO_64, MO_16) 6076 
DO_LD1_2(ld1hds, MO_64, MO_16) 6077 6078 DO_LD1_2(ld1ss, MO_32, MO_32) 6079 DO_LD1_2(ld1sdu, MO_64, MO_32) 6080 DO_LD1_2(ld1sds, MO_64, MO_32) 6081 6082 DO_LD1_2(ld1dd, MO_64, MO_64) 6083 6084 #undef DO_LD1_1 6085 #undef DO_LD1_2 6086 6087 #define DO_LDN_1(N) \ 6088 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ 6089 target_ulong addr, uint32_t desc) \ 6090 { \ 6091 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ 6092 sve_ld1bb_host, sve_ld1bb_tlb); \ 6093 } \ 6094 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ 6095 target_ulong addr, uint32_t desc) \ 6096 { \ 6097 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ 6098 sve_ld1bb_host, sve_ld1bb_tlb); \ 6099 } 6100 6101 #define DO_LDN_2(N, SUFF, ESZ) \ 6102 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ 6103 target_ulong addr, uint32_t desc) \ 6104 { \ 6105 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 6106 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 6107 } \ 6108 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ 6109 target_ulong addr, uint32_t desc) \ 6110 { \ 6111 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ 6112 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 6113 } \ 6114 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ 6115 target_ulong addr, uint32_t desc) \ 6116 { \ 6117 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 6118 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ 6119 } \ 6120 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ 6121 target_ulong addr, uint32_t desc) \ 6122 { \ 6123 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ 6124 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ 6125 } 6126 6127 DO_LDN_1(2) 6128 DO_LDN_1(3) 6129 DO_LDN_1(4) 6130 6131 DO_LDN_2(2, hh, MO_16) 6132 DO_LDN_2(3, hh, MO_16) 6133 DO_LDN_2(4, hh, MO_16) 6134 6135 DO_LDN_2(2, ss, MO_32) 6136 DO_LDN_2(3, ss, MO_32) 6137 DO_LDN_2(4, ss, MO_32) 6138 6139 DO_LDN_2(2, dd, MO_64) 6140 DO_LDN_2(3, dd, MO_64) 6141 DO_LDN_2(4, dd, MO_64) 6142 6143 #undef DO_LDN_1 6144 #undef DO_LDN_2 6145 6146 /* 6147 * Load contiguous data, first-fault and no-fault. 6148 * 6149 * For user-only, we control the race between page_check_range and 6150 * another thread's munmap by using set/clear_helper_retaddr. Any 6151 * SEGV that occurs between those markers is assumed to be because 6152 * the guest page vanished. Keep that block as small as possible 6153 * so that unrelated QEMU bugs are not blamed on the guest. 6154 */ 6155 6156 /* Fault on byte I. All bits in FFR from I are cleared. The vector 6157 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE 6158 * option, which leaves subsequent data unchanged. 6159 */ 6160 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) 6161 { 6162 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p; 6163 6164 if (i & 63) { 6165 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63); 6166 i = ROUND_UP(i, 64); 6167 } 6168 for (; i < oprsz; i += 64) { 6169 ffr[i / 64] = 0; 6170 } 6171 } 6172 6173 /* 6174 * Common helper for all contiguous no-fault and first-fault loads. 
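 *
 * For FAULT_FIRST, only the first active element may take a real
 * exception; any later element that cannot be loaded safely instead
 * clears FFR from that element onward (record_fault).  For FAULT_NO,
 * no element is allowed to fault at all.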
6175 */ 6176 static inline QEMU_ALWAYS_INLINE 6177 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, 6178 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc, 6179 const int esz, const int msz, const SVEContFault fault, 6180 sve_ldst1_host_fn *host_fn, 6181 sve_ldst1_tlb_fn *tlb_fn) 6182 { 6183 const unsigned rd = simd_data(desc); 6184 void *vd = &env->vfp.zregs[rd]; 6185 const intptr_t reg_max = simd_oprsz(desc); 6186 intptr_t reg_off, mem_off, reg_last; 6187 SVEContLdSt info; 6188 int flags; 6189 void *host; 6190 6191 /* Find the active elements. */ 6192 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) { 6193 /* The entire predicate was false; no load occurs. */ 6194 memset(vd, 0, reg_max); 6195 return; 6196 } 6197 reg_off = info.reg_off_first[0]; 6198 6199 /* Probe the page(s). */ 6200 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) { 6201 /* Fault on first element. */ 6202 tcg_debug_assert(fault == FAULT_NO); 6203 memset(vd, 0, reg_max); 6204 goto do_fault; 6205 } 6206 6207 mem_off = info.mem_off_first[0]; 6208 flags = info.page[0].flags; 6209 6210 /* 6211 * Disable MTE checking if the Tagged bit is not set. Since TBI must 6212 * be set within MTEDESC for MTE, !mtedesc => !mte_active. 6213 */ 6214 if (!info.page[0].tagged) { 6215 mtedesc = 0; 6216 } 6217 6218 if (fault == FAULT_FIRST) { 6219 /* Trapping mte check for the first-fault element. */ 6220 if (mtedesc) { 6221 mte_check(env, mtedesc, addr + mem_off, retaddr); 6222 } 6223 6224 /* 6225 * Special handling of the first active element, 6226 * if it crosses a page boundary or is MMIO. 6227 */ 6228 bool is_split = mem_off == info.mem_off_split; 6229 if (unlikely(flags != 0) || unlikely(is_split)) { 6230 /* 6231 * Use the slow path for cross-page handling. 6232 * Might trap for MMIO or watchpoints. 6233 */ 6234 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6235 6236 /* After any fault, zero the other elements. */ 6237 swap_memzero(vd, reg_off); 6238 reg_off += 1 << esz; 6239 mem_off += 1 << msz; 6240 swap_memzero(vd + reg_off, reg_max - reg_off); 6241 6242 if (is_split) { 6243 goto second_page; 6244 } 6245 } else { 6246 memset(vd, 0, reg_max); 6247 } 6248 } else { 6249 memset(vd, 0, reg_max); 6250 if (unlikely(mem_off == info.mem_off_split)) { 6251 /* The first active element crosses a page boundary. */ 6252 flags |= info.page[1].flags; 6253 if (unlikely(flags & TLB_MMIO)) { 6254 /* Some page is MMIO, see below. */ 6255 goto do_fault; 6256 } 6257 if (unlikely(flags & TLB_WATCHPOINT) && 6258 (cpu_watchpoint_address_matches 6259 (env_cpu(env), addr + mem_off, 1 << msz) 6260 & BP_MEM_READ)) { 6261 /* Watchpoint hit, see below. */ 6262 goto do_fault; 6263 } 6264 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6265 goto do_fault; 6266 } 6267 /* 6268 * Use the slow path for cross-page handling. 6269 * This is RAM, without a watchpoint, and will not trap. 6270 */ 6271 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr); 6272 goto second_page; 6273 } 6274 } 6275 6276 /* 6277 * From this point on, all memory operations are MemSingleNF. 6278 * 6279 * Per the MemSingleNF pseudocode, a no-fault load from Device memory 6280 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead. 6281 * 6282 * Unfortuately we do not have access to the memory attributes from the 6283 * PTE to tell Device memory from Normal memory. So we make a mostly 6284 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO. 
6285 * This gives the right answer for the common cases of "Normal memory, 6286 * backed by host RAM" and "Device memory, backed by MMIO". 6287 * The architecture allows us to suppress an NF load and return 6288 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner 6289 * case of "Normal memory, backed by MMIO" is permitted. The case we 6290 * get wrong is "Device memory, backed by host RAM", for which we 6291 * should return (UNKNOWN, FAULT) for but do not. 6292 * 6293 * Similarly, CPU_BP breakpoints would raise exceptions, and so 6294 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and 6295 * architectural breakpoints the same. 6296 */ 6297 if (unlikely(flags & TLB_MMIO)) { 6298 goto do_fault; 6299 } 6300 6301 reg_last = info.reg_off_last[0]; 6302 host = info.page[0].host; 6303 6304 set_helper_retaddr(retaddr); 6305 6306 do { 6307 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3)); 6308 do { 6309 if ((pg >> (reg_off & 63)) & 1) { 6310 if (unlikely(flags & TLB_WATCHPOINT) && 6311 (cpu_watchpoint_address_matches 6312 (env_cpu(env), addr + mem_off, 1 << msz) 6313 & BP_MEM_READ)) { 6314 clear_helper_retaddr(); 6315 goto do_fault; 6316 } 6317 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) { 6318 clear_helper_retaddr(); 6319 goto do_fault; 6320 } 6321 host_fn(vd, reg_off, host + mem_off); 6322 } 6323 reg_off += 1 << esz; 6324 mem_off += 1 << msz; 6325 } while (reg_off <= reg_last && (reg_off & 63)); 6326 } while (reg_off <= reg_last); 6327 6328 clear_helper_retaddr(); 6329 6330 /* 6331 * MemSingleNF is allowed to fail for any reason. We have special 6332 * code above to handle the first element crossing a page boundary. 6333 * As an implementation choice, decline to handle a cross-page element 6334 * in any other position. 6335 */ 6336 reg_off = info.reg_off_split; 6337 if (reg_off >= 0) { 6338 goto do_fault; 6339 } 6340 6341 second_page: 6342 reg_off = info.reg_off_first[1]; 6343 if (likely(reg_off < 0)) { 6344 /* No active elements on the second page. All done. */ 6345 return; 6346 } 6347 6348 /* 6349 * MemSingleNF is allowed to fail for any reason. As an implementation 6350 * choice, decline to handle elements on the second page. This should 6351 * be low frequency as the guest walks through memory -- the next 6352 * iteration of the guest's loop should be aligned on the page boundary, 6353 * and then all following iterations will stay aligned. 6354 */ 6355 6356 do_fault: 6357 record_fault(env, reg_off, reg_max); 6358 } 6359 6360 static inline QEMU_ALWAYS_INLINE 6361 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, 6362 uint32_t desc, const uintptr_t retaddr, 6363 const int esz, const int msz, const SVEContFault fault, 6364 sve_ldst1_host_fn *host_fn, 6365 sve_ldst1_tlb_fn *tlb_fn) 6366 { 6367 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6368 int bit55 = extract64(addr, 55, 1); 6369 6370 /* Remove mtedesc from the normal sve descriptor. */ 6371 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6372 6373 /* Perform gross MTE suppression early. 
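 * If TBI is disabled for this address's bit 55, or TCMA makes the
 * access unchecked for its tag, no MTE fault can be raised, so clear
 * mtedesc and skip the per-element checks entirely.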
*/ 6374 if (!tbi_check(mtedesc, bit55) || 6375 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6376 mtedesc = 0; 6377 } 6378 6379 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, 6380 esz, msz, fault, host_fn, tlb_fn); 6381 } 6382 6383 #define DO_LDFF1_LDNF1_1(PART, ESZ) \ 6384 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ 6385 target_ulong addr, uint32_t desc) \ 6386 { \ 6387 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ 6388 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6389 } \ 6390 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ 6391 target_ulong addr, uint32_t desc) \ 6392 { \ 6393 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ 6394 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6395 } \ 6396 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6397 target_ulong addr, uint32_t desc) \ 6398 { \ 6399 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ 6400 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6401 } \ 6402 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ 6403 target_ulong addr, uint32_t desc) \ 6404 { \ 6405 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ 6406 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ 6407 } 6408 6409 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ 6410 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ 6411 target_ulong addr, uint32_t desc) \ 6412 { \ 6413 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6414 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6415 } \ 6416 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ 6417 target_ulong addr, uint32_t desc) \ 6418 { \ 6419 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6420 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6421 } \ 6422 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ 6423 target_ulong addr, uint32_t desc) \ 6424 { \ 6425 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ 6426 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6427 } \ 6428 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ 6429 target_ulong addr, uint32_t desc) \ 6430 { \ 6431 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ 6432 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6433 } \ 6434 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6435 target_ulong addr, uint32_t desc) \ 6436 { \ 6437 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6438 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6439 } \ 6440 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ 6441 target_ulong addr, uint32_t desc) \ 6442 { \ 6443 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6444 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ 6445 } \ 6446 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6447 target_ulong addr, uint32_t desc) \ 6448 { \ 6449 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ 6450 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6451 } \ 6452 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ 6453 target_ulong addr, uint32_t desc) \ 6454 { \ 6455 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 6456 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ 6457 } 6458 6459 DO_LDFF1_LDNF1_1(bb, MO_8) 6460 DO_LDFF1_LDNF1_1(bhu, 
MO_16) 6461 DO_LDFF1_LDNF1_1(bhs, MO_16) 6462 DO_LDFF1_LDNF1_1(bsu, MO_32) 6463 DO_LDFF1_LDNF1_1(bss, MO_32) 6464 DO_LDFF1_LDNF1_1(bdu, MO_64) 6465 DO_LDFF1_LDNF1_1(bds, MO_64) 6466 6467 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16) 6468 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16) 6469 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16) 6470 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16) 6471 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16) 6472 6473 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32) 6474 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32) 6475 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32) 6476 6477 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) 6478 6479 #undef DO_LDFF1_LDNF1_1 6480 #undef DO_LDFF1_LDNF1_2 6481 6482 /* 6483 * Common helper for all contiguous 1,2,3,4-register predicated stores. 6484 */ 6485 6486 static inline QEMU_ALWAYS_INLINE 6487 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, 6488 uint32_t desc, const uintptr_t retaddr, 6489 const int esz, const int msz, const int N, uint32_t mtedesc, 6490 sve_ldst1_host_fn *host_fn, 6491 sve_ldst1_tlb_fn *tlb_fn) 6492 { 6493 const unsigned rd = simd_data(desc); 6494 const intptr_t reg_max = simd_oprsz(desc); 6495 intptr_t reg_off, reg_last, mem_off; 6496 SVEContLdSt info; 6497 void *host; 6498 int i, flags; 6499 6500 /* Find the active elements. */ 6501 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) { 6502 /* The entire predicate was false; no store occurs. */ 6503 return; 6504 } 6505 6506 /* Probe the page(s). Exit with exception for any invalid page. */ 6507 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr); 6508 6509 /* Handle watchpoints for all active elements. */ 6510 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, 6511 BP_MEM_WRITE, retaddr); 6512 6513 /* 6514 * Handle mte checks for all active elements. 6515 * Since TBI must be set for MTE, !mtedesc => !mte_active. 6516 */ 6517 if (mtedesc) { 6518 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz, 6519 mtedesc, retaddr); 6520 } 6521 6522 flags = info.page[0].flags | info.page[1].flags; 6523 if (unlikely(flags != 0)) { 6524 /* 6525 * At least one page includes MMIO. 6526 * Any bus operation can fail with cpu_transaction_failed, 6527 * which for ARM will raise SyncExternal. We cannot avoid 6528 * this fault and will leave with the store incomplete. 
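 * Unlike the loads above, there is no scratch buffer to fall back on:
 * each active element is stored straight to memory via tlb_fn, so
 * elements written before a failing transaction remain written.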
6529 */ 6530 mem_off = info.mem_off_first[0]; 6531 reg_off = info.reg_off_first[0]; 6532 reg_last = info.reg_off_last[1]; 6533 if (reg_last < 0) { 6534 reg_last = info.reg_off_split; 6535 if (reg_last < 0) { 6536 reg_last = info.reg_off_last[0]; 6537 } 6538 } 6539 6540 do { 6541 uint64_t pg = vg[reg_off >> 6]; 6542 do { 6543 if ((pg >> (reg_off & 63)) & 1) { 6544 for (i = 0; i < N; ++i) { 6545 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6546 addr + mem_off + (i << msz), retaddr); 6547 } 6548 } 6549 reg_off += 1 << esz; 6550 mem_off += N << msz; 6551 } while (reg_off & 63); 6552 } while (reg_off <= reg_last); 6553 return; 6554 } 6555 6556 mem_off = info.mem_off_first[0]; 6557 reg_off = info.reg_off_first[0]; 6558 reg_last = info.reg_off_last[0]; 6559 host = info.page[0].host; 6560 6561 set_helper_retaddr(retaddr); 6562 6563 while (reg_off <= reg_last) { 6564 uint64_t pg = vg[reg_off >> 6]; 6565 do { 6566 if ((pg >> (reg_off & 63)) & 1) { 6567 for (i = 0; i < N; ++i) { 6568 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6569 host + mem_off + (i << msz)); 6570 } 6571 } 6572 reg_off += 1 << esz; 6573 mem_off += N << msz; 6574 } while (reg_off <= reg_last && (reg_off & 63)); 6575 } 6576 6577 clear_helper_retaddr(); 6578 6579 /* 6580 * Use the slow path to manage the cross-page misalignment. 6581 * But we know this is RAM and cannot trap. 6582 */ 6583 mem_off = info.mem_off_split; 6584 if (unlikely(mem_off >= 0)) { 6585 reg_off = info.reg_off_split; 6586 for (i = 0; i < N; ++i) { 6587 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off, 6588 addr + mem_off + (i << msz), retaddr); 6589 } 6590 } 6591 6592 mem_off = info.mem_off_first[1]; 6593 if (unlikely(mem_off >= 0)) { 6594 reg_off = info.reg_off_first[1]; 6595 reg_last = info.reg_off_last[1]; 6596 host = info.page[1].host; 6597 6598 set_helper_retaddr(retaddr); 6599 6600 do { 6601 uint64_t pg = vg[reg_off >> 6]; 6602 do { 6603 if ((pg >> (reg_off & 63)) & 1) { 6604 for (i = 0; i < N; ++i) { 6605 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off, 6606 host + mem_off + (i << msz)); 6607 } 6608 } 6609 reg_off += 1 << esz; 6610 mem_off += N << msz; 6611 } while (reg_off & 63); 6612 } while (reg_off <= reg_last); 6613 6614 clear_helper_retaddr(); 6615 } 6616 } 6617 6618 static inline QEMU_ALWAYS_INLINE 6619 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, 6620 uint32_t desc, const uintptr_t ra, 6621 const int esz, const int msz, const int N, 6622 sve_ldst1_host_fn *host_fn, 6623 sve_ldst1_tlb_fn *tlb_fn) 6624 { 6625 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6626 int bit55 = extract64(addr, 55, 1); 6627 6628 /* Remove mtedesc from the normal sve descriptor. */ 6629 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6630 6631 /* Perform gross MTE suppression early. 
*/ 6632 if (!tbi_check(mtedesc, bit55) || 6633 tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) { 6634 mtedesc = 0; 6635 } 6636 6637 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn); 6638 } 6639 6640 #define DO_STN_1(N, NAME, ESZ) \ 6641 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ 6642 target_ulong addr, uint32_t desc) \ 6643 { \ 6644 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ 6645 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6646 } \ 6647 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ 6648 target_ulong addr, uint32_t desc) \ 6649 { \ 6650 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ 6651 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ 6652 } 6653 6654 #define DO_STN_2(N, NAME, ESZ, MSZ) \ 6655 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ 6656 target_ulong addr, uint32_t desc) \ 6657 { \ 6658 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6659 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6660 } \ 6661 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ 6662 target_ulong addr, uint32_t desc) \ 6663 { \ 6664 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ 6665 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6666 } \ 6667 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ 6668 target_ulong addr, uint32_t desc) \ 6669 { \ 6670 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6671 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ 6672 } \ 6673 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ 6674 target_ulong addr, uint32_t desc) \ 6675 { \ 6676 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ 6677 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ 6678 } 6679 6680 DO_STN_1(1, bb, MO_8) 6681 DO_STN_1(1, bh, MO_16) 6682 DO_STN_1(1, bs, MO_32) 6683 DO_STN_1(1, bd, MO_64) 6684 DO_STN_1(2, bb, MO_8) 6685 DO_STN_1(3, bb, MO_8) 6686 DO_STN_1(4, bb, MO_8) 6687 6688 DO_STN_2(1, hh, MO_16, MO_16) 6689 DO_STN_2(1, hs, MO_32, MO_16) 6690 DO_STN_2(1, hd, MO_64, MO_16) 6691 DO_STN_2(2, hh, MO_16, MO_16) 6692 DO_STN_2(3, hh, MO_16, MO_16) 6693 DO_STN_2(4, hh, MO_16, MO_16) 6694 6695 DO_STN_2(1, ss, MO_32, MO_32) 6696 DO_STN_2(1, sd, MO_64, MO_32) 6697 DO_STN_2(2, ss, MO_32, MO_32) 6698 DO_STN_2(3, ss, MO_32, MO_32) 6699 DO_STN_2(4, ss, MO_32, MO_32) 6700 6701 DO_STN_2(1, dd, MO_64, MO_64) 6702 DO_STN_2(2, dd, MO_64, MO_64) 6703 DO_STN_2(3, dd, MO_64, MO_64) 6704 DO_STN_2(4, dd, MO_64, MO_64) 6705 6706 #undef DO_STN_1 6707 #undef DO_STN_2 6708 6709 /* 6710 * Loads with a vector index. 6711 */ 6712 6713 /* 6714 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed. 
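 *
 * In the off_* helpers below, zsu = 32-bit offsets zero-extended,
 * zss = 32-bit offsets sign-extended, zd = full 64-bit offsets; the
 * _s/_d suffix gives the lane size of the index vector (packed 32-bit
 * vs 64-bit elements).  E.g. off_zss_d(zm, 8) returns the low 32 bits
 * of the second 64-bit lane, sign-extended to target_ulong.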
6715 */ 6716 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs); 6717 6718 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs) 6719 { 6720 return *(uint32_t *)(reg + H1_4(reg_ofs)); 6721 } 6722 6723 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs) 6724 { 6725 return *(int32_t *)(reg + H1_4(reg_ofs)); 6726 } 6727 6728 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs) 6729 { 6730 return (uint32_t)*(uint64_t *)(reg + reg_ofs); 6731 } 6732 6733 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs) 6734 { 6735 return (int32_t)*(uint64_t *)(reg + reg_ofs); 6736 } 6737 6738 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) 6739 { 6740 return *(uint64_t *)(reg + reg_ofs); 6741 } 6742 6743 static inline QEMU_ALWAYS_INLINE 6744 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6745 target_ulong base, uint32_t desc, uintptr_t retaddr, 6746 uint32_t mtedesc, int esize, int msize, 6747 zreg_off_fn *off_fn, 6748 sve_ldst1_host_fn *host_fn, 6749 sve_ldst1_tlb_fn *tlb_fn) 6750 { 6751 const int mmu_idx = arm_env_mmu_index(env); 6752 const intptr_t reg_max = simd_oprsz(desc); 6753 const int scale = simd_data(desc); 6754 ARMVectorReg scratch; 6755 intptr_t reg_off; 6756 SVEHostPage info, info2; 6757 6758 memset(&scratch, 0, reg_max); 6759 reg_off = 0; 6760 do { 6761 uint64_t pg = vg[reg_off >> 6]; 6762 do { 6763 if (likely(pg & 1)) { 6764 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 6765 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 6766 6767 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD, 6768 mmu_idx, retaddr); 6769 6770 if (likely(in_page >= msize)) { 6771 if (unlikely(info.flags & TLB_WATCHPOINT)) { 6772 cpu_check_watchpoint(env_cpu(env), addr, msize, 6773 info.attrs, BP_MEM_READ, retaddr); 6774 } 6775 if (mtedesc && info.tagged) { 6776 mte_check(env, mtedesc, addr, retaddr); 6777 } 6778 if (unlikely(info.flags & TLB_MMIO)) { 6779 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6780 } else { 6781 set_helper_retaddr(retaddr); 6782 host_fn(&scratch, reg_off, info.host); 6783 clear_helper_retaddr(); 6784 } 6785 } else { 6786 /* Element crosses the page boundary. */ 6787 sve_probe_page(&info2, false, env, addr + in_page, 0, 6788 MMU_DATA_LOAD, mmu_idx, retaddr); 6789 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) { 6790 cpu_check_watchpoint(env_cpu(env), addr, 6791 msize, info.attrs, 6792 BP_MEM_READ, retaddr); 6793 } 6794 if (mtedesc && info.tagged) { 6795 mte_check(env, mtedesc, addr, retaddr); 6796 } 6797 tlb_fn(env, &scratch, reg_off, addr, retaddr); 6798 } 6799 } 6800 reg_off += esize; 6801 pg >>= esize; 6802 } while (reg_off & 63); 6803 } while (reg_off < reg_max); 6804 6805 /* Wait until all exceptions have been raised to write back. */ 6806 memcpy(vd, &scratch, reg_max); 6807 } 6808 6809 static inline QEMU_ALWAYS_INLINE 6810 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6811 target_ulong base, uint32_t desc, uintptr_t retaddr, 6812 int esize, int msize, zreg_off_fn *off_fn, 6813 sve_ldst1_host_fn *host_fn, 6814 sve_ldst1_tlb_fn *tlb_fn) 6815 { 6816 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6817 /* Remove mtedesc from the normal sve descriptor. */ 6818 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 6819 6820 /* 6821 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 6822 * offset base entirely over the address space hole to change the 6823 * pointer tag, or change the bit55 selector. 
So we could here 6824 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 6825 */ 6826 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 6827 esize, msize, off_fn, host_fn, tlb_fn); 6828 } 6829 6830 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ 6831 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6832 void *vm, target_ulong base, uint32_t desc) \ 6833 { \ 6834 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 6835 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6836 } \ 6837 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6838 void *vm, target_ulong base, uint32_t desc) \ 6839 { \ 6840 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 6841 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6842 } 6843 6844 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ 6845 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 6846 void *vm, target_ulong base, uint32_t desc) \ 6847 { \ 6848 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 6849 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6850 } \ 6851 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 6852 void *vm, target_ulong base, uint32_t desc) \ 6853 { \ 6854 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 6855 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 6856 } 6857 6858 DO_LD1_ZPZ_S(bsu, zsu, MO_8) 6859 DO_LD1_ZPZ_S(bsu, zss, MO_8) 6860 DO_LD1_ZPZ_D(bdu, zsu, MO_8) 6861 DO_LD1_ZPZ_D(bdu, zss, MO_8) 6862 DO_LD1_ZPZ_D(bdu, zd, MO_8) 6863 6864 DO_LD1_ZPZ_S(bss, zsu, MO_8) 6865 DO_LD1_ZPZ_S(bss, zss, MO_8) 6866 DO_LD1_ZPZ_D(bds, zsu, MO_8) 6867 DO_LD1_ZPZ_D(bds, zss, MO_8) 6868 DO_LD1_ZPZ_D(bds, zd, MO_8) 6869 6870 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16) 6871 DO_LD1_ZPZ_S(hsu_le, zss, MO_16) 6872 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16) 6873 DO_LD1_ZPZ_D(hdu_le, zss, MO_16) 6874 DO_LD1_ZPZ_D(hdu_le, zd, MO_16) 6875 6876 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16) 6877 DO_LD1_ZPZ_S(hsu_be, zss, MO_16) 6878 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16) 6879 DO_LD1_ZPZ_D(hdu_be, zss, MO_16) 6880 DO_LD1_ZPZ_D(hdu_be, zd, MO_16) 6881 6882 DO_LD1_ZPZ_S(hss_le, zsu, MO_16) 6883 DO_LD1_ZPZ_S(hss_le, zss, MO_16) 6884 DO_LD1_ZPZ_D(hds_le, zsu, MO_16) 6885 DO_LD1_ZPZ_D(hds_le, zss, MO_16) 6886 DO_LD1_ZPZ_D(hds_le, zd, MO_16) 6887 6888 DO_LD1_ZPZ_S(hss_be, zsu, MO_16) 6889 DO_LD1_ZPZ_S(hss_be, zss, MO_16) 6890 DO_LD1_ZPZ_D(hds_be, zsu, MO_16) 6891 DO_LD1_ZPZ_D(hds_be, zss, MO_16) 6892 DO_LD1_ZPZ_D(hds_be, zd, MO_16) 6893 6894 DO_LD1_ZPZ_S(ss_le, zsu, MO_32) 6895 DO_LD1_ZPZ_S(ss_le, zss, MO_32) 6896 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32) 6897 DO_LD1_ZPZ_D(sdu_le, zss, MO_32) 6898 DO_LD1_ZPZ_D(sdu_le, zd, MO_32) 6899 6900 DO_LD1_ZPZ_S(ss_be, zsu, MO_32) 6901 DO_LD1_ZPZ_S(ss_be, zss, MO_32) 6902 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32) 6903 DO_LD1_ZPZ_D(sdu_be, zss, MO_32) 6904 DO_LD1_ZPZ_D(sdu_be, zd, MO_32) 6905 6906 DO_LD1_ZPZ_D(sds_le, zsu, MO_32) 6907 DO_LD1_ZPZ_D(sds_le, zss, MO_32) 6908 DO_LD1_ZPZ_D(sds_le, zd, MO_32) 6909 6910 DO_LD1_ZPZ_D(sds_be, zsu, MO_32) 6911 DO_LD1_ZPZ_D(sds_be, zss, MO_32) 6912 DO_LD1_ZPZ_D(sds_be, zd, MO_32) 6913 6914 DO_LD1_ZPZ_D(dd_le, zsu, MO_64) 6915 DO_LD1_ZPZ_D(dd_le, zss, MO_64) 6916 DO_LD1_ZPZ_D(dd_le, zd, MO_64) 6917 6918 DO_LD1_ZPZ_D(dd_be, zsu, MO_64) 6919 DO_LD1_ZPZ_D(dd_be, zss, MO_64) 6920 DO_LD1_ZPZ_D(dd_be, zd, MO_64) 6921 6922 #undef DO_LD1_ZPZ_S 6923 #undef DO_LD1_ZPZ_D 6924 6925 /* First fault loads with a vector index. 
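 *
 * Only the first active element may take an exception; if any later
 * element would fault (unmapped page, MMIO, watchpoint hit, failed
 * MTE probe, or an element crossing a page boundary), the load stops
 * and FFR is cleared from that element onward instead.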
*/ 6926 6927 /* 6928 * Common helpers for all gather first-faulting loads. 6929 */ 6930 6931 static inline QEMU_ALWAYS_INLINE 6932 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 6933 target_ulong base, uint32_t desc, uintptr_t retaddr, 6934 uint32_t mtedesc, const int esz, const int msz, 6935 zreg_off_fn *off_fn, 6936 sve_ldst1_host_fn *host_fn, 6937 sve_ldst1_tlb_fn *tlb_fn) 6938 { 6939 const int mmu_idx = arm_env_mmu_index(env); 6940 const intptr_t reg_max = simd_oprsz(desc); 6941 const int scale = simd_data(desc); 6942 const int esize = 1 << esz; 6943 const int msize = 1 << msz; 6944 intptr_t reg_off; 6945 SVEHostPage info; 6946 target_ulong addr, in_page; 6947 ARMVectorReg scratch; 6948 6949 /* Skip to the first true predicate. */ 6950 reg_off = find_next_active(vg, 0, reg_max, esz); 6951 if (unlikely(reg_off >= reg_max)) { 6952 /* The entire predicate was false; no load occurs. */ 6953 memset(vd, 0, reg_max); 6954 return; 6955 } 6956 6957 /* Protect against overlap between vd and vm. */ 6958 if (unlikely(vd == vm)) { 6959 vm = memcpy(&scratch, vm, reg_max); 6960 } 6961 6962 /* 6963 * Probe the first element, allowing faults. 6964 */ 6965 addr = base + (off_fn(vm, reg_off) << scale); 6966 if (mtedesc) { 6967 mte_check(env, mtedesc, addr, retaddr); 6968 } 6969 tlb_fn(env, vd, reg_off, addr, retaddr); 6970 6971 /* After any fault, zero the other elements. */ 6972 swap_memzero(vd, reg_off); 6973 reg_off += esize; 6974 swap_memzero(vd + reg_off, reg_max - reg_off); 6975 6976 /* 6977 * Probe the remaining elements, not allowing faults. 6978 */ 6979 while (reg_off < reg_max) { 6980 uint64_t pg = vg[reg_off >> 6]; 6981 do { 6982 if (likely((pg >> (reg_off & 63)) & 1)) { 6983 addr = base + (off_fn(vm, reg_off) << scale); 6984 in_page = -(addr | TARGET_PAGE_MASK); 6985 6986 if (unlikely(in_page < msize)) { 6987 /* Stop if the element crosses a page boundary. */ 6988 goto fault; 6989 } 6990 6991 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD, 6992 mmu_idx, retaddr); 6993 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) { 6994 goto fault; 6995 } 6996 if (unlikely(info.flags & TLB_WATCHPOINT) && 6997 (cpu_watchpoint_address_matches 6998 (env_cpu(env), addr, msize) & BP_MEM_READ)) { 6999 goto fault; 7000 } 7001 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) { 7002 goto fault; 7003 } 7004 7005 set_helper_retaddr(retaddr); 7006 host_fn(vd, reg_off, info.host); 7007 clear_helper_retaddr(); 7008 } 7009 reg_off += esize; 7010 } while (reg_off & 63); 7011 } 7012 return; 7013 7014 fault: 7015 record_fault(env, reg_off, reg_max); 7016 } 7017 7018 static inline QEMU_ALWAYS_INLINE 7019 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7020 target_ulong base, uint32_t desc, uintptr_t retaddr, 7021 const int esz, const int msz, 7022 zreg_off_fn *off_fn, 7023 sve_ldst1_host_fn *host_fn, 7024 sve_ldst1_tlb_fn *tlb_fn) 7025 { 7026 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7027 /* Remove mtedesc from the normal sve descriptor. */ 7028 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7029 7030 /* 7031 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7032 * offset base entirely over the address space hole to change the 7033 * pointer tag, or change the bit55 selector. So we could here 7034 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
7035 */ 7036 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7037 esz, msz, off_fn, host_fn, tlb_fn); 7038 } 7039 7040 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ 7041 void HELPER(sve_ldff##MEM##_##OFS) \ 7042 (CPUARMState *env, void *vd, void *vg, \ 7043 void *vm, target_ulong base, uint32_t desc) \ 7044 { \ 7045 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ 7046 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7047 } \ 7048 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 7049 (CPUARMState *env, void *vd, void *vg, \ 7050 void *vm, target_ulong base, uint32_t desc) \ 7051 { \ 7052 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ 7053 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7054 } 7055 7056 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ 7057 void HELPER(sve_ldff##MEM##_##OFS) \ 7058 (CPUARMState *env, void *vd, void *vg, \ 7059 void *vm, target_ulong base, uint32_t desc) \ 7060 { \ 7061 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ 7062 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7063 } \ 7064 void HELPER(sve_ldff##MEM##_##OFS##_mte) \ 7065 (CPUARMState *env, void *vd, void *vg, \ 7066 void *vm, target_ulong base, uint32_t desc) \ 7067 { \ 7068 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ 7069 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ 7070 } 7071 7072 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) 7073 DO_LDFF1_ZPZ_S(bsu, zss, MO_8) 7074 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8) 7075 DO_LDFF1_ZPZ_D(bdu, zss, MO_8) 7076 DO_LDFF1_ZPZ_D(bdu, zd, MO_8) 7077 7078 DO_LDFF1_ZPZ_S(bss, zsu, MO_8) 7079 DO_LDFF1_ZPZ_S(bss, zss, MO_8) 7080 DO_LDFF1_ZPZ_D(bds, zsu, MO_8) 7081 DO_LDFF1_ZPZ_D(bds, zss, MO_8) 7082 DO_LDFF1_ZPZ_D(bds, zd, MO_8) 7083 7084 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16) 7085 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16) 7086 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16) 7087 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16) 7088 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16) 7089 7090 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16) 7091 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16) 7092 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16) 7093 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16) 7094 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16) 7095 7096 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16) 7097 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16) 7098 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16) 7099 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16) 7100 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16) 7101 7102 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16) 7103 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16) 7104 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16) 7105 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16) 7106 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16) 7107 7108 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32) 7109 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32) 7110 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32) 7111 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32) 7112 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32) 7113 7114 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32) 7115 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32) 7116 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32) 7117 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32) 7118 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32) 7119 7120 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32) 7121 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32) 7122 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32) 7123 7124 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32) 7125 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32) 7126 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32) 7127 7128 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64) 7129 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64) 7130 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64) 7131 7132 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64) 7133 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64) 7134 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) 7135 7136 /* Stores with a vector index. 
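 * All active elements are probed for writability (including watchpoint
 * and MTE checks) before any data is written, so that every exception
 * recognized during the probe pass is taken before memory is modified;
 * elements in ordinary RAM are then stored via the host pointers
 * collected in that pass.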
*/ 7137 7138 static inline QEMU_ALWAYS_INLINE 7139 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7140 target_ulong base, uint32_t desc, uintptr_t retaddr, 7141 uint32_t mtedesc, int esize, int msize, 7142 zreg_off_fn *off_fn, 7143 sve_ldst1_host_fn *host_fn, 7144 sve_ldst1_tlb_fn *tlb_fn) 7145 { 7146 const int mmu_idx = arm_env_mmu_index(env); 7147 const intptr_t reg_max = simd_oprsz(desc); 7148 const int scale = simd_data(desc); 7149 void *host[ARM_MAX_VQ * 4]; 7150 intptr_t reg_off, i; 7151 SVEHostPage info, info2; 7152 7153 /* 7154 * Probe all of the elements for host addresses and flags. 7155 */ 7156 i = reg_off = 0; 7157 do { 7158 uint64_t pg = vg[reg_off >> 6]; 7159 do { 7160 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7161 target_ulong in_page = -(addr | TARGET_PAGE_MASK); 7162 7163 host[i] = NULL; 7164 if (likely((pg >> (reg_off & 63)) & 1)) { 7165 if (likely(in_page >= msize)) { 7166 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE, 7167 mmu_idx, retaddr); 7168 if (!(info.flags & TLB_MMIO)) { 7169 host[i] = info.host; 7170 } 7171 } else { 7172 /* 7173 * Element crosses the page boundary. 7174 * Probe both pages, but do not record the host address, 7175 * so that we use the slow path. 7176 */ 7177 sve_probe_page(&info, false, env, addr, 0, 7178 MMU_DATA_STORE, mmu_idx, retaddr); 7179 sve_probe_page(&info2, false, env, addr + in_page, 0, 7180 MMU_DATA_STORE, mmu_idx, retaddr); 7181 info.flags |= info2.flags; 7182 } 7183 7184 if (unlikely(info.flags & TLB_WATCHPOINT)) { 7185 cpu_check_watchpoint(env_cpu(env), addr, msize, 7186 info.attrs, BP_MEM_WRITE, retaddr); 7187 } 7188 7189 if (mtedesc && info.tagged) { 7190 mte_check(env, mtedesc, addr, retaddr); 7191 } 7192 } 7193 i += 1; 7194 reg_off += esize; 7195 } while (reg_off & 63); 7196 } while (reg_off < reg_max); 7197 7198 /* 7199 * Now that we have recognized all exceptions except SyncExternal 7200 * (from TLB_MMIO), which we cannot avoid, perform all of the stores. 7201 * 7202 * Note for the common case of an element in RAM, not crossing a page 7203 * boundary, we have stored the host address in host[]. This doubles 7204 * as a first-level check against the predicate, since only enabled 7205 * elements have non-null host addresses. 7206 */ 7207 i = reg_off = 0; 7208 do { 7209 void *h = host[i]; 7210 if (likely(h != NULL)) { 7211 set_helper_retaddr(retaddr); 7212 host_fn(vd, reg_off, h); 7213 clear_helper_retaddr(); 7214 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) { 7215 target_ulong addr = base + (off_fn(vm, reg_off) << scale); 7216 tlb_fn(env, vd, reg_off, addr, retaddr); 7217 } 7218 i += 1; 7219 reg_off += esize; 7220 } while (reg_off < reg_max); 7221 } 7222 7223 static inline QEMU_ALWAYS_INLINE 7224 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, 7225 target_ulong base, uint32_t desc, uintptr_t retaddr, 7226 int esize, int msize, zreg_off_fn *off_fn, 7227 sve_ldst1_host_fn *host_fn, 7228 sve_ldst1_tlb_fn *tlb_fn) 7229 { 7230 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7231 /* Remove mtedesc from the normal sve descriptor. */ 7232 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); 7233 7234 /* 7235 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot 7236 * offset base entirely over the address space hole to change the 7237 * pointer tag, or change the bit55 selector. So we could here 7238 * examine TBI + TCMA like we do for sve_ldN_r_mte(). 
7239 */ 7240 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, 7241 esize, msize, off_fn, host_fn, tlb_fn); 7242 } 7243 7244 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ 7245 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7246 void *vm, target_ulong base, uint32_t desc) \ 7247 { \ 7248 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ 7249 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7250 } \ 7251 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7252 void *vm, target_ulong base, uint32_t desc) \ 7253 { \ 7254 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ 7255 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7256 } 7257 7258 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ 7259 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ 7260 void *vm, target_ulong base, uint32_t desc) \ 7261 { \ 7262 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ 7263 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7264 } \ 7265 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ 7266 void *vm, target_ulong base, uint32_t desc) \ 7267 { \ 7268 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ 7269 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ 7270 } 7271 7272 DO_ST1_ZPZ_S(bs, zsu, MO_8) 7273 DO_ST1_ZPZ_S(hs_le, zsu, MO_16) 7274 DO_ST1_ZPZ_S(hs_be, zsu, MO_16) 7275 DO_ST1_ZPZ_S(ss_le, zsu, MO_32) 7276 DO_ST1_ZPZ_S(ss_be, zsu, MO_32) 7277 7278 DO_ST1_ZPZ_S(bs, zss, MO_8) 7279 DO_ST1_ZPZ_S(hs_le, zss, MO_16) 7280 DO_ST1_ZPZ_S(hs_be, zss, MO_16) 7281 DO_ST1_ZPZ_S(ss_le, zss, MO_32) 7282 DO_ST1_ZPZ_S(ss_be, zss, MO_32) 7283 7284 DO_ST1_ZPZ_D(bd, zsu, MO_8) 7285 DO_ST1_ZPZ_D(hd_le, zsu, MO_16) 7286 DO_ST1_ZPZ_D(hd_be, zsu, MO_16) 7287 DO_ST1_ZPZ_D(sd_le, zsu, MO_32) 7288 DO_ST1_ZPZ_D(sd_be, zsu, MO_32) 7289 DO_ST1_ZPZ_D(dd_le, zsu, MO_64) 7290 DO_ST1_ZPZ_D(dd_be, zsu, MO_64) 7291 7292 DO_ST1_ZPZ_D(bd, zss, MO_8) 7293 DO_ST1_ZPZ_D(hd_le, zss, MO_16) 7294 DO_ST1_ZPZ_D(hd_be, zss, MO_16) 7295 DO_ST1_ZPZ_D(sd_le, zss, MO_32) 7296 DO_ST1_ZPZ_D(sd_be, zss, MO_32) 7297 DO_ST1_ZPZ_D(dd_le, zss, MO_64) 7298 DO_ST1_ZPZ_D(dd_be, zss, MO_64) 7299 7300 DO_ST1_ZPZ_D(bd, zd, MO_8) 7301 DO_ST1_ZPZ_D(hd_le, zd, MO_16) 7302 DO_ST1_ZPZ_D(hd_be, zd, MO_16) 7303 DO_ST1_ZPZ_D(sd_le, zd, MO_32) 7304 DO_ST1_ZPZ_D(sd_be, zd, MO_32) 7305 DO_ST1_ZPZ_D(dd_le, zd, MO_64) 7306 DO_ST1_ZPZ_D(dd_be, zd, MO_64) 7307 7308 #undef DO_ST1_ZPZ_S 7309 #undef DO_ST1_ZPZ_D 7310 7311 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7312 { 7313 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7314 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7315 7316 for (i = 0; i < opr_sz; ++i) { 7317 d[i] = n[i] ^ m[i] ^ k[i]; 7318 } 7319 } 7320 7321 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7322 { 7323 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7324 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7325 7326 for (i = 0; i < opr_sz; ++i) { 7327 d[i] = n[i] ^ (m[i] & ~k[i]); 7328 } 7329 } 7330 7331 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7332 { 7333 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7334 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7335 7336 for (i = 0; i < opr_sz; ++i) { 7337 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]); 7338 } 7339 } 7340 7341 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7342 { 7343 intptr_t i, opr_sz = simd_oprsz(desc) / 
8; 7344 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7345 7346 for (i = 0; i < opr_sz; ++i) { 7347 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]); 7348 } 7349 } 7350 7351 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc) 7352 { 7353 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7354 uint64_t *d = vd, *n = vn, *m = vm, *k = vk; 7355 7356 for (i = 0; i < opr_sz; ++i) { 7357 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i])); 7358 } 7359 } 7360 7361 /* 7362 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n. 7363 * See hasless(v,1) from 7364 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord 7365 */ 7366 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz) 7367 { 7368 int bits = 8 << esz; 7369 uint64_t ones = dup_const(esz, 1); 7370 uint64_t signs = ones << (bits - 1); 7371 uint64_t cmp0, cmp1; 7372 7373 cmp1 = dup_const(esz, n); 7374 cmp0 = cmp1 ^ m0; 7375 cmp1 = cmp1 ^ m1; 7376 cmp0 = (cmp0 - ones) & ~cmp0; 7377 cmp1 = (cmp1 - ones) & ~cmp1; 7378 return (cmp0 | cmp1) & signs; 7379 } 7380 7381 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg, 7382 uint32_t desc, int esz, bool nmatch) 7383 { 7384 uint16_t esz_mask = pred_esz_masks[esz]; 7385 intptr_t opr_sz = simd_oprsz(desc); 7386 uint32_t flags = PREDTEST_INIT; 7387 intptr_t i, j, k; 7388 7389 for (i = 0; i < opr_sz; i += 16) { 7390 uint64_t m0 = *(uint64_t *)(vm + i); 7391 uint64_t m1 = *(uint64_t *)(vm + i + 8); 7392 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask; 7393 uint16_t out = 0; 7394 7395 for (j = 0; j < 16; j += 8) { 7396 uint64_t n = *(uint64_t *)(vn + i + j); 7397 7398 for (k = 0; k < 8; k += 1 << esz) { 7399 if (pg & (1 << (j + k))) { 7400 bool o = do_match2(n >> (k * 8), m0, m1, esz); 7401 out |= (o ^ nmatch) << (j + k); 7402 } 7403 } 7404 } 7405 *(uint16_t *)(vd + H1_2(i >> 3)) = out; 7406 flags = iter_predtest_fwd(out, pg, flags); 7407 } 7408 return flags; 7409 } 7410 7411 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \ 7412 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ 7413 { \ 7414 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \ 7415 } 7416 7417 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false) 7418 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false) 7419 7420 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true) 7421 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true) 7422 7423 #undef DO_PPZZ_MATCH 7424 7425 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg, 7426 uint32_t desc) 7427 { 7428 ARMVectorReg scratch; 7429 intptr_t i, j; 7430 intptr_t opr_sz = simd_oprsz(desc); 7431 uint32_t *d = vd, *n = vn, *m = vm; 7432 uint8_t *pg = vg; 7433 7434 if (d == n) { 7435 n = memcpy(&scratch, n, opr_sz); 7436 if (d == m) { 7437 m = n; 7438 } 7439 } else if (d == m) { 7440 m = memcpy(&scratch, m, opr_sz); 7441 } 7442 7443 for (i = 0; i < opr_sz; i += 4) { 7444 uint64_t count = 0; 7445 uint8_t pred; 7446 7447 pred = pg[H1(i >> 3)] >> (i & 7); 7448 if (pred & 1) { 7449 uint32_t nn = n[H4(i >> 2)]; 7450 7451 for (j = 0; j <= i; j += 4) { 7452 pred = pg[H1(j >> 3)] >> (j & 7); 7453 if ((pred & 1) && nn == m[H4(j >> 2)]) { 7454 ++count; 7455 } 7456 } 7457 } 7458 d[H4(i >> 2)] = count; 7459 } 7460 } 7461 7462 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg, 7463 uint32_t desc) 7464 { 7465 ARMVectorReg scratch; 7466 intptr_t i, j; 7467 intptr_t opr_sz = simd_oprsz(desc); 7468 uint64_t *d = vd, *n = vn, *m = vm; 7469 uint8_t *pg = vg; 7470 7471 if (d == n) { 7472 n = memcpy(&scratch, n, opr_sz); 7473 if (d 
== m) { 7474 m = n; 7475 } 7476 } else if (d == m) { 7477 m = memcpy(&scratch, m, opr_sz); 7478 } 7479 7480 for (i = 0; i < opr_sz / 8; ++i) { 7481 uint64_t count = 0; 7482 if (pg[H1(i)] & 1) { 7483 uint64_t nn = n[i]; 7484 for (j = 0; j <= i; ++j) { 7485 if ((pg[H1(j)] & 1) && nn == m[j]) { 7486 ++count; 7487 } 7488 } 7489 } 7490 d[i] = count; 7491 } 7492 } 7493 7494 /* 7495 * Returns the number of bytes in m0 and m1 that match n. 7496 * Unlike do_match2 we don't just need true/false, we need an exact count. 7497 * This requires two extra logical operations. 7498 */ 7499 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1) 7500 { 7501 const uint64_t mask = dup_const(MO_8, 0x7f); 7502 uint64_t cmp0, cmp1; 7503 7504 cmp1 = dup_const(MO_8, n); 7505 cmp0 = cmp1 ^ m0; 7506 cmp1 = cmp1 ^ m1; 7507 7508 /* 7509 * 1: clear msb of each byte to avoid carry to next byte (& mask) 7510 * 2: carry in to msb if byte != 0 (+ mask) 7511 * 3: set msb if cmp has msb set (| cmp) 7512 * 4: set ~msb to ignore them (| mask) 7513 * We now have 0xff for byte != 0 or 0x7f for byte == 0. 7514 * 5: invert, resulting in 0x80 if and only if byte == 0. 7515 */ 7516 cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask); 7517 cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask); 7518 7519 /* 7520 * Combine the two compares in a way that the bits do 7521 * not overlap, and so preserves the count of set bits. 7522 * If the host has an efficient instruction for ctpop, 7523 * then ctpop(x) + ctpop(y) has the same number of 7524 * operations as ctpop(x | (y >> 1)). If the host does 7525 * not have an efficient ctpop, then we only want to 7526 * use it once. 7527 */ 7528 return ctpop64(cmp0 | (cmp1 >> 1)); 7529 } 7530 7531 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc) 7532 { 7533 intptr_t i, j; 7534 intptr_t opr_sz = simd_oprsz(desc); 7535 7536 for (i = 0; i < opr_sz; i += 16) { 7537 uint64_t n0 = *(uint64_t *)(vn + i); 7538 uint64_t m0 = *(uint64_t *)(vm + i); 7539 uint64_t n1 = *(uint64_t *)(vn + i + 8); 7540 uint64_t m1 = *(uint64_t *)(vm + i + 8); 7541 uint64_t out0 = 0; 7542 uint64_t out1 = 0; 7543 7544 for (j = 0; j < 64; j += 8) { 7545 uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1); 7546 uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1); 7547 out0 |= cnt0 << j; 7548 out1 |= cnt1 << j; 7549 } 7550 7551 *(uint64_t *)(vd + i) = out0; 7552 *(uint64_t *)(vd + i + 8) = out1; 7553 } 7554 } 7555 7556 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc) 7557 { 7558 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7559 int shr = simd_data(desc); 7560 int shl = 8 - shr; 7561 uint64_t mask = dup_const(MO_8, 0xff >> shr); 7562 uint64_t *d = vd, *n = vn, *m = vm; 7563 7564 for (i = 0; i < opr_sz; ++i) { 7565 uint64_t t = n[i] ^ m[i]; 7566 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); 7567 } 7568 } 7569 7570 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc) 7571 { 7572 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 7573 int shr = simd_data(desc); 7574 int shl = 16 - shr; 7575 uint64_t mask = dup_const(MO_16, 0xffff >> shr); 7576 uint64_t *d = vd, *n = vn, *m = vm; 7577 7578 for (i = 0; i < opr_sz; ++i) { 7579 uint64_t t = n[i] ^ m[i]; 7580 d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask); 7581 } 7582 } 7583 7584 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc) 7585 { 7586 intptr_t i, opr_sz = simd_oprsz(desc) / 4; 7587 int shr = simd_data(desc); 7588 uint32_t *d = vd, *n = vn, *m = vm; 7589 7590 for (i = 0; i < opr_sz; ++i) { 7591 d[i] = ror32(n[i] ^ 
m[i], shr); 7592 } 7593 } 7594 7595 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va, 7596 float_status *status, uint32_t desc) 7597 { 7598 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4); 7599 7600 for (s = 0; s < opr_sz; ++s) { 7601 float32 *n = vn + s * sizeof(float32) * 4; 7602 float32 *m = vm + s * sizeof(float32) * 4; 7603 float32 *a = va + s * sizeof(float32) * 4; 7604 float32 *d = vd + s * sizeof(float32) * 4; 7605 float32 n00 = n[H4(0)], n01 = n[H4(1)]; 7606 float32 n10 = n[H4(2)], n11 = n[H4(3)]; 7607 float32 m00 = m[H4(0)], m01 = m[H4(1)]; 7608 float32 m10 = m[H4(2)], m11 = m[H4(3)]; 7609 float32 p0, p1; 7610 7611 /* i = 0, j = 0 */ 7612 p0 = float32_mul(n00, m00, status); 7613 p1 = float32_mul(n01, m01, status); 7614 d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status); 7615 7616 /* i = 0, j = 1 */ 7617 p0 = float32_mul(n00, m10, status); 7618 p1 = float32_mul(n01, m11, status); 7619 d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status); 7620 7621 /* i = 1, j = 0 */ 7622 p0 = float32_mul(n10, m00, status); 7623 p1 = float32_mul(n11, m01, status); 7624 d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status); 7625 7626 /* i = 1, j = 1 */ 7627 p0 = float32_mul(n10, m10, status); 7628 p1 = float32_mul(n11, m11, status); 7629 d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status); 7630 } 7631 } 7632 7633 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va, 7634 float_status *status, uint32_t desc) 7635 { 7636 intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4); 7637 7638 for (s = 0; s < opr_sz; ++s) { 7639 float64 *n = vn + s * sizeof(float64) * 4; 7640 float64 *m = vm + s * sizeof(float64) * 4; 7641 float64 *a = va + s * sizeof(float64) * 4; 7642 float64 *d = vd + s * sizeof(float64) * 4; 7643 float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3]; 7644 float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3]; 7645 float64 p0, p1; 7646 7647 /* i = 0, j = 0 */ 7648 p0 = float64_mul(n00, m00, status); 7649 p1 = float64_mul(n01, m01, status); 7650 d[0] = float64_add(a[0], float64_add(p0, p1, status), status); 7651 7652 /* i = 0, j = 1 */ 7653 p0 = float64_mul(n00, m10, status); 7654 p1 = float64_mul(n01, m11, status); 7655 d[1] = float64_add(a[1], float64_add(p0, p1, status), status); 7656 7657 /* i = 1, j = 0 */ 7658 p0 = float64_mul(n10, m00, status); 7659 p1 = float64_mul(n11, m01, status); 7660 d[2] = float64_add(a[2], float64_add(p0, p1, status), status); 7661 7662 /* i = 1, j = 1 */ 7663 p0 = float64_mul(n10, m10, status); 7664 p1 = float64_mul(n11, m11, status); 7665 d[3] = float64_add(a[3], float64_add(p0, p1, status), status); 7666 } 7667 } 7668 7669 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 7670 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 7671 float_status *status, uint32_t desc) \ 7672 { \ 7673 intptr_t i = simd_oprsz(desc); \ 7674 uint64_t *g = vg; \ 7675 do { \ 7676 uint64_t pg = g[(i - 1) >> 6]; \ 7677 do { \ 7678 i -= sizeof(TYPEW); \ 7679 if (likely((pg >> (i & 63)) & 1)) { \ 7680 TYPEW nn = *(TYPEW *)(vn + HW(i)); \ 7681 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \ 7682 } \ 7683 } while (i & 63); \ 7684 } while (i != 0); \ 7685 } 7686 7687 DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16) 7688 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16) 7689 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32) 7690 7691 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \ 
7692 void HELPER(NAME)(void *vd, void *vn, void *vg, \ 7693 float_status *status, uint32_t desc) \ 7694 { \ 7695 intptr_t i = simd_oprsz(desc); \ 7696 uint64_t *g = vg; \ 7697 do { \ 7698 uint64_t pg = g[(i - 1) >> 6]; \ 7699 do { \ 7700 i -= sizeof(TYPEW); \ 7701 if (likely((pg >> (i & 63)) & 1)) { \ 7702 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \ 7703 *(TYPEW *)(vd + HW(i)) = OP(nn, status); \ 7704 } \ 7705 } while (i & 63); \ 7706 } while (i != 0); \ 7707 } 7708 7709 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32) 7710 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64) 7711 7712 #undef DO_FCVTLT 7713 #undef DO_FCVTNT 7714
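/*
 * The SWAR byte-match tricks used by do_match2() and do_histseg_cnt()
 * above can be hard to follow inline.  The block below is a minimal
 * standalone sketch of the same idea for the byte case; it is guarded
 * out of the build and meant to be compiled separately with gcc or
 * clang.  The helper names dup8(), any_zero_byte() and zero_byte_mask()
 * are illustrative only and do not exist elsewhere in QEMU.
 */
#if 0
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Replicate one byte into all eight lanes of a uint64_t. */
static uint64_t dup8(uint8_t b)
{
    return b * 0x0101010101010101ull;
}

/*
 * Cheap test: true if any byte of x is zero.  This is hasless(v, 1);
 * the per-byte intermediate may over-flag a lane sitting above a zero
 * byte, so it is only usable as a yes/no answer, as in do_match2().
 */
static bool any_zero_byte(uint64_t x)
{
    return ((x - dup8(0x01)) & ~x & dup8(0x80)) != 0;
}

/*
 * Exact mask: 0x80 in each byte lane of x that is zero, and nowhere
 * else.  This is the formulation used by do_histseg_cnt(), which needs
 * an exact population count of the matching lanes.
 */
static uint64_t zero_byte_mask(uint64_t x)
{
    const uint64_t mask = dup8(0x7f);
    return ~(((x & mask) + mask) | x | mask);
}

int main(void)
{
    uint64_t haystack = 0x1122003344005566ull;
    uint8_t needle = 0x44;

    /* XOR turns "lane equals needle" into "lane is zero". */
    uint64_t x = haystack ^ dup8(needle);

    printf("any match:   %d\n", any_zero_byte(x));
    printf("match mask:  %016" PRIx64 "\n", zero_byte_mask(x));
    printf("match count: %d\n", __builtin_popcountll(zero_byte_mask(x)));
    return 0;
}
#endif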