/*
 * M-profile MVE Operations
 *
 * Copyright (c) 2021 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "vec_internal.h"
#include "exec/helper-proto.h"
#include "exec/cpu_ldst.h"
#include "exec/exec-all.h"
#include "tcg/tcg.h"

static uint16_t mve_eci_mask(CPUARMState *env)
{
    /*
     * Return the mask of which elements in the MVE vector correspond
     * to beats being executed. The mask has 1 bits for executed lanes
     * and 0 bits where ECI says this beat was already executed.
     */
    int eci;

    if ((env->condexec_bits & 0xf) != 0) {
        return 0xffff;
    }

    eci = env->condexec_bits >> 4;
    switch (eci) {
    case ECI_NONE:
        return 0xffff;
    case ECI_A0:
        return 0xfff0;
    case ECI_A0A1:
        return 0xff00;
    case ECI_A0A1A2:
    case ECI_A0A1A2B0:
        return 0xf000;
    default:
        g_assert_not_reached();
    }
}

static uint16_t mve_element_mask(CPUARMState *env)
{
    /*
     * Return the mask of which elements in the MVE vector should be
     * updated. This is a combination of multiple things:
     * (1) by default, we update every lane in the vector
     * (2) VPT predication stores its state in the VPR register;
     * (3) low-overhead-branch tail predication will mask out part
     *     of the vector on the final iteration of the loop
     * (4) if EPSR.ECI is set then we must execute only some beats
     *     of the insn
     * We combine all these into a 16-bit result with the same semantics
     * as VPR.P0: 0 to mask the lane, 1 if it is active.
     * 8-bit vector ops will look at all bits of the result;
     * 16-bit ops will look at bits 0, 2, 4, ...;
     * 32-bit ops will look at bits 0, 4, 8 and 12.
     * Compare pseudocode GetCurInstrBeat(), though that only returns
     * the 4-bit slice of the mask corresponding to a single beat.
     */
    uint16_t mask = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0);

    if (!(env->v7m.vpr & R_V7M_VPR_MASK01_MASK)) {
        mask |= 0xff;
    }
    if (!(env->v7m.vpr & R_V7M_VPR_MASK23_MASK)) {
        mask |= 0xff00;
    }

    if (env->v7m.ltpsize < 4 &&
        env->regs[14] <= (1 << (4 - env->v7m.ltpsize))) {
        /*
         * Tail predication active, and this is the last loop iteration.
         * The element size is (1 << ltpsize), and we only want to process
         * loopcount elements, so we want to retain the least significant
         * (loopcount * esize) predicate bits and zero out bits above that.
         */
        int masklen = env->regs[14] << env->v7m.ltpsize;
        assert(masklen <= 16);
        uint16_t ltpmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0;
        mask &= ltpmask;
    }

    /*
     * ECI bits indicate which beats are already executed;
     * we handle this by effectively predicating them out.
     */
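    /*
     * Illustrative worked example (not from the pseudocode): with no VPT
     * predication active, ltpsize == 2 (32-bit elements) and LR == 2 on
     * the final low-overhead-loop iteration, masklen is 2 << 2 == 8, so
     * the code above leaves mask == 0x00ff (only the low two 32-bit lanes
     * active); if ECI then says beats A0 and A1 were already executed,
     * mve_eci_mask() returns 0xff00 and the combined mask becomes 0.
     */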
    mask &= mve_eci_mask(env);
    return mask;
}

static void mve_advance_vpt(CPUARMState *env)
{
    /* Advance the VPT and ECI state if necessary */
    uint32_t vpr = env->v7m.vpr;
    unsigned mask01, mask23;

    if ((env->condexec_bits & 0xf) == 0) {
        env->condexec_bits = (env->condexec_bits == (ECI_A0A1A2B0 << 4)) ?
            (ECI_A0 << 4) : (ECI_NONE << 4);
    }

    if (!(vpr & (R_V7M_VPR_MASK01_MASK | R_V7M_VPR_MASK23_MASK))) {
        /* VPT not enabled, nothing to do */
        return;
    }

    mask01 = FIELD_EX32(vpr, V7M_VPR, MASK01);
    mask23 = FIELD_EX32(vpr, V7M_VPR, MASK23);
    if (mask01 > 8) {
        /* high bit set, but not 0b1000: invert the relevant half of P0 */
        vpr ^= 0xff;
    }
    if (mask23 > 8) {
        /* high bit set, but not 0b1000: invert the relevant half of P0 */
        vpr ^= 0xff00;
    }
    vpr = FIELD_DP32(vpr, V7M_VPR, MASK01, mask01 << 1);
    vpr = FIELD_DP32(vpr, V7M_VPR, MASK23, mask23 << 1);
    env->v7m.vpr = vpr;
}


#define DO_VLDR(OP, MSIZE, LDTYPE, ESIZE, TYPE) \
    void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \
    { \
        TYPE *d = vd; \
        uint16_t mask = mve_element_mask(env); \
        unsigned b, e; \
        /* \
         * R_SXTM allows the dest reg to become UNKNOWN for abandoned \
         * beats so we don't care if we update part of the dest and \
         * then take an exception. \
         */ \
        for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \
            if (mask & (1 << b)) { \
                d[H##ESIZE(e)] = cpu_##LDTYPE##_data_ra(env, addr, GETPC()); \
            } \
            addr += MSIZE; \
        } \
        mve_advance_vpt(env); \
    }

#define DO_VSTR(OP, MSIZE, STTYPE, ESIZE, TYPE) \
    void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \
    { \
        TYPE *d = vd; \
        uint16_t mask = mve_element_mask(env); \
        unsigned b, e; \
        for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \
            if (mask & (1 << b)) { \
                cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \
            } \
            addr += MSIZE; \
        } \
        mve_advance_vpt(env); \
    }

DO_VLDR(vldrb, 1, ldub, 1, uint8_t)
DO_VLDR(vldrh, 2, lduw, 2, uint16_t)
DO_VLDR(vldrw, 4, ldl, 4, uint32_t)

DO_VSTR(vstrb, 1, stb, 1, uint8_t)
DO_VSTR(vstrh, 2, stw, 2, uint16_t)
DO_VSTR(vstrw, 4, stl, 4, uint32_t)

DO_VLDR(vldrb_sh, 1, ldsb, 2, int16_t)
DO_VLDR(vldrb_sw, 1, ldsb, 4, int32_t)
DO_VLDR(vldrb_uh, 1, ldub, 2, uint16_t)
DO_VLDR(vldrb_uw, 1, ldub, 4, uint32_t)
DO_VLDR(vldrh_sw, 2, ldsw, 4, int32_t)
DO_VLDR(vldrh_uw, 2, lduw, 4, uint32_t)

DO_VSTR(vstrb_h, 1, stb, 2, int16_t)
DO_VSTR(vstrb_w, 1, stb, 4, int32_t)
DO_VSTR(vstrh_w, 2, stw, 4, int32_t)

#undef DO_VLDR
#undef DO_VSTR

/*
 * The mergemask(D, R, M) macro performs the operation "*D = R" but
 * storing only the bytes which correspond to 1 bits in M,
 * leaving other bytes in *D unchanged. We use _Generic
 * to select the correct implementation based on the type of D.
 */

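/*
 * Minimal worked example (assuming the expand_pred_b_data[] table from
 * vec_internal.h, which expands each predicate bit into a byte of 0xff):
 * for a 32-bit element whose four mask bits are 0b0011, mergemask_uw()
 * below computes bmask == 0x0000ffff, so only the low two bytes of *D
 * are updated.
 */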
static void mergemask_ub(uint8_t *d, uint8_t r, uint16_t mask)
{
    if (mask & 1) {
        *d = r;
    }
}

static void mergemask_sb(int8_t *d, int8_t r, uint16_t mask)
{
    mergemask_ub((uint8_t *)d, r, mask);
}

static void mergemask_uh(uint16_t *d, uint16_t r, uint16_t mask)
{
    uint16_t bmask = expand_pred_b_data[mask & 3];
    *d = (*d & ~bmask) | (r & bmask);
}

static void mergemask_sh(int16_t *d, int16_t r, uint16_t mask)
{
    mergemask_uh((uint16_t *)d, r, mask);
}

static void mergemask_uw(uint32_t *d, uint32_t r, uint16_t mask)
{
    uint32_t bmask = expand_pred_b_data[mask & 0xf];
    *d = (*d & ~bmask) | (r & bmask);
}

static void mergemask_sw(int32_t *d, int32_t r, uint16_t mask)
{
    mergemask_uw((uint32_t *)d, r, mask);
}

static void mergemask_uq(uint64_t *d, uint64_t r, uint16_t mask)
{
    uint64_t bmask = expand_pred_b_data[mask & 0xff];
    *d = (*d & ~bmask) | (r & bmask);
}

static void mergemask_sq(int64_t *d, int64_t r, uint16_t mask)
{
    mergemask_uq((uint64_t *)d, r, mask);
}

#define mergemask(D, R, M) \
    _Generic(D, \
             uint8_t *: mergemask_ub, \
             int8_t *: mergemask_sb, \
             uint16_t *: mergemask_uh, \
             int16_t *: mergemask_sh, \
             uint32_t *: mergemask_uw, \
             int32_t *: mergemask_sw, \
             uint64_t *: mergemask_uq, \
             int64_t *: mergemask_sq)(D, R, M)

void HELPER(mve_vdup)(CPUARMState *env, void *vd, uint32_t val)
{
    /*
     * The generated code already replicated an 8 or 16 bit constant
     * into the 32-bit value, so we only need to write the 32-bit
     * value to all elements of the Qreg, allowing for predication.
     */
    uint32_t *d = vd;
    uint16_t mask = mve_element_mask(env);
    unsigned e;
    for (e = 0; e < 16 / 4; e++, mask >>= 4) {
        mergemask(&d[H4(e)], val, mask);
    }
    mve_advance_vpt(env);
}

#define DO_1OP(OP, ESIZE, TYPE, FN) \
    void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \
    { \
        TYPE *d = vd, *m = vm; \
        uint16_t mask = mve_element_mask(env); \
        unsigned e; \
        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
            mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)]), mask); \
        } \
        mve_advance_vpt(env); \
    }

#define DO_CLS_B(N) (clrsb32(N) - 24)
#define DO_CLS_H(N) (clrsb32(N) - 16)

DO_1OP(vclsb, 1, int8_t, DO_CLS_B)
DO_1OP(vclsh, 2, int16_t, DO_CLS_H)
DO_1OP(vclsw, 4, int32_t, clrsb32)

#define DO_CLZ_B(N) (clz32(N) - 24)
#define DO_CLZ_H(N) (clz32(N) - 16)

DO_1OP(vclzb, 1, uint8_t, DO_CLZ_B)
DO_1OP(vclzh, 2, uint16_t, DO_CLZ_H)
DO_1OP(vclzw, 4, uint32_t, clz32)

DO_1OP(vrev16b, 2, uint16_t, bswap16)
DO_1OP(vrev32b, 4, uint32_t, bswap32)
DO_1OP(vrev32h, 4, uint32_t, hswap32)
DO_1OP(vrev64b, 8, uint64_t, bswap64)
DO_1OP(vrev64h, 8, uint64_t, hswap64)
DO_1OP(vrev64w, 8, uint64_t, wswap64)

#define DO_NOT(N) (~(N))

DO_1OP(vmvn, 8, uint64_t, DO_NOT)

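/*
 * Illustrative note: dup_const(MO_16, 0x7fff) used by DO_FABSH below
 * expands to 0x7fff7fff7fff7fff, i.e. the mask that clears just the
 * sign bit of each half-precision lane in a 64-bit chunk.
 */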
#define DO_ABS(N) ((N) < 0 ? -(N) : (N))
#define DO_FABSH(N) ((N) & dup_const(MO_16, 0x7fff))
#define DO_FABSS(N) ((N) & dup_const(MO_32, 0x7fffffff))

DO_1OP(vabsb, 1, int8_t, DO_ABS)
DO_1OP(vabsh, 2, int16_t, DO_ABS)
DO_1OP(vabsw, 4, int32_t, DO_ABS)

/* We can do these 64 bits at a time */
DO_1OP(vfabsh, 8, uint64_t, DO_FABSH)
DO_1OP(vfabss, 8, uint64_t, DO_FABSS)

#define DO_NEG(N) (-(N))
#define DO_FNEGH(N) ((N) ^ dup_const(MO_16, 0x8000))
#define DO_FNEGS(N) ((N) ^ dup_const(MO_32, 0x80000000))

DO_1OP(vnegb, 1, int8_t, DO_NEG)
DO_1OP(vnegh, 2, int16_t, DO_NEG)
DO_1OP(vnegw, 4, int32_t, DO_NEG)

/* We can do these 64 bits at a time */
DO_1OP(vfnegh, 8, uint64_t, DO_FNEGH)
DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS)

/*
 * 1 operand immediates: Vda is destination and possibly also one source.
 * All these insns work at 64-bit widths.
 */
#define DO_1OP_IMM(OP, FN) \
    void HELPER(mve_##OP)(CPUARMState *env, void *vda, uint64_t imm) \
    { \
        uint64_t *da = vda; \
        uint16_t mask = mve_element_mask(env); \
        unsigned e; \
        for (e = 0; e < 16 / 8; e++, mask >>= 8) { \
            mergemask(&da[H8(e)], FN(da[H8(e)], imm), mask); \
        } \
        mve_advance_vpt(env); \
    }

#define DO_MOVI(N, I) (I)
#define DO_ANDI(N, I) ((N) & (I))
#define DO_ORRI(N, I) ((N) | (I))

DO_1OP_IMM(vmovi, DO_MOVI)
DO_1OP_IMM(vandi, DO_ANDI)
DO_1OP_IMM(vorri, DO_ORRI)

#define DO_2OP(OP, ESIZE, TYPE, FN) \
    void HELPER(glue(mve_, OP))(CPUARMState *env, \
                                void *vd, void *vn, void *vm) \
    { \
        TYPE *d = vd, *n = vn, *m = vm; \
        uint16_t mask = mve_element_mask(env); \
        unsigned e; \
        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
            mergemask(&d[H##ESIZE(e)], \
                      FN(n[H##ESIZE(e)], m[H##ESIZE(e)]), mask); \
        } \
        mve_advance_vpt(env); \
    }

/* provide unsigned 2-op helpers for all sizes */
#define DO_2OP_U(OP, FN) \
    DO_2OP(OP##b, 1, uint8_t, FN) \
    DO_2OP(OP##h, 2, uint16_t, FN) \
    DO_2OP(OP##w, 4, uint32_t, FN)

/* provide signed 2-op helpers for all sizes */
#define DO_2OP_S(OP, FN) \
    DO_2OP(OP##b, 1, int8_t, FN) \
    DO_2OP(OP##h, 2, int16_t, FN) \
    DO_2OP(OP##w, 4, int32_t, FN)

/*
 * "Long" operations where two half-sized inputs (taken from either the
 * top or the bottom of the input vector) produce a double-width result.
 * Here ESIZE, TYPE are for the input, and LESIZE, LTYPE for the output.
 */
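/*
 * For example (illustrative): in a 16->32 widening op, TOP == 0 reads
 * input elements 0, 2, 4, 6 (the bottom half of each 32-bit output lane)
 * and TOP == 1 reads elements 1, 3, 5, 7.
 */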
#define DO_2OP_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \
    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
    { \
        LTYPE *d = vd; \
        TYPE *n = vn, *m = vm; \
        uint16_t mask = mve_element_mask(env); \
        unsigned le; \
        for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
            LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], \
                         m[H##ESIZE(le * 2 + TOP)]); \
            mergemask(&d[H##LESIZE(le)], r, mask); \
        } \
        mve_advance_vpt(env); \
    }

#define DO_2OP_SAT(OP, ESIZE, TYPE, FN) \
    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
    { \
        TYPE *d = vd, *n = vn, *m = vm; \
        uint16_t mask = mve_element_mask(env); \
        unsigned e; \
        bool qc = false; \
        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
            bool sat = false; \
            TYPE r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], &sat); \
            mergemask(&d[H##ESIZE(e)], r, mask); \
            qc |= sat & mask & 1; \
        } \
        if (qc) { \
            env->vfp.qc[0] = qc; \
        } \
        mve_advance_vpt(env); \
    }

/* provide unsigned 2-op helpers for all sizes */
#define DO_2OP_SAT_U(OP, FN) \
    DO_2OP_SAT(OP##b, 1, uint8_t, FN) \
    DO_2OP_SAT(OP##h, 2, uint16_t, FN) \
    DO_2OP_SAT(OP##w, 4, uint32_t, FN)

/* provide signed 2-op helpers for all sizes */
#define DO_2OP_SAT_S(OP, FN) \
    DO_2OP_SAT(OP##b, 1, int8_t, FN) \
    DO_2OP_SAT(OP##h, 2, int16_t, FN) \
    DO_2OP_SAT(OP##w, 4, int32_t, FN)

#define DO_AND(N, M) ((N) & (M))
#define DO_BIC(N, M) ((N) & ~(M))
#define DO_ORR(N, M) ((N) | (M))
#define DO_ORN(N, M) ((N) | ~(M))
#define DO_EOR(N, M) ((N) ^ (M))

DO_2OP(vand, 8, uint64_t, DO_AND)
DO_2OP(vbic, 8, uint64_t, DO_BIC)
DO_2OP(vorr, 8, uint64_t, DO_ORR)
DO_2OP(vorn, 8, uint64_t, DO_ORN)
DO_2OP(veor, 8, uint64_t, DO_EOR)

#define DO_ADD(N, M) ((N) + (M))
#define DO_SUB(N, M) ((N) - (M))
#define DO_MUL(N, M) ((N) * (M))

DO_2OP_U(vadd, DO_ADD)
DO_2OP_U(vsub, DO_SUB)
DO_2OP_U(vmul, DO_MUL)

DO_2OP_L(vmullbsb, 0, 1, int8_t, 2, int16_t, DO_MUL)
DO_2OP_L(vmullbsh, 0, 2, int16_t, 4, int32_t, DO_MUL)
DO_2OP_L(vmullbsw, 0, 4, int32_t, 8, int64_t, DO_MUL)
DO_2OP_L(vmullbub, 0, 1, uint8_t, 2, uint16_t, DO_MUL)
DO_2OP_L(vmullbuh, 0, 2, uint16_t, 4, uint32_t, DO_MUL)
DO_2OP_L(vmullbuw, 0, 4, uint32_t, 8, uint64_t, DO_MUL)

DO_2OP_L(vmulltsb, 1, 1, int8_t, 2, int16_t, DO_MUL)
DO_2OP_L(vmulltsh, 1, 2, int16_t, 4, int32_t, DO_MUL)
DO_2OP_L(vmulltsw, 1, 4, int32_t, 8, int64_t, DO_MUL)
DO_2OP_L(vmulltub, 1, 1, uint8_t, 2, uint16_t, DO_MUL)
DO_2OP_L(vmulltuh, 1, 2, uint16_t, 4, uint32_t, DO_MUL)
DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL)

/*
 * Because the computation type is at least twice as large as required,
 * these work for both signed and unsigned source types.
 */
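/*
 * Worked example (illustrative): for bytes, signed -128 * -128 == 0x4000,
 * so do_mulh_b() returns 0x40; unsigned 255 * 255 == 0xfe01, so it
 * returns 0xfe. Both are the correct high halves, because the int32_t
 * computation type represents either interpretation of the inputs exactly.
 */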
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_w(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint8_t do_rmulh_b(int32_t n, int32_t m)
{
    return (n * m + (1U << 7)) >> 8;
}

static inline uint16_t do_rmulh_h(int32_t n, int32_t m)
{
    return (n * m + (1U << 15)) >> 16;
}

static inline uint32_t do_rmulh_w(int64_t n, int64_t m)
{
    return (n * m + (1U << 31)) >> 32;
}

DO_2OP(vmulhsb, 1, int8_t, do_mulh_b)
DO_2OP(vmulhsh, 2, int16_t, do_mulh_h)
DO_2OP(vmulhsw, 4, int32_t, do_mulh_w)
DO_2OP(vmulhub, 1, uint8_t, do_mulh_b)
DO_2OP(vmulhuh, 2, uint16_t, do_mulh_h)
DO_2OP(vmulhuw, 4, uint32_t, do_mulh_w)

DO_2OP(vrmulhsb, 1, int8_t, do_rmulh_b)
DO_2OP(vrmulhsh, 2, int16_t, do_rmulh_h)
DO_2OP(vrmulhsw, 4, int32_t, do_rmulh_w)
DO_2OP(vrmulhub, 1, uint8_t, do_rmulh_b)
DO_2OP(vrmulhuh, 2, uint16_t, do_rmulh_h)
DO_2OP(vrmulhuw, 4, uint32_t, do_rmulh_w)

#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))

DO_2OP_S(vmaxs, DO_MAX)
DO_2OP_U(vmaxu, DO_MAX)
DO_2OP_S(vmins, DO_MIN)
DO_2OP_U(vminu, DO_MIN)

#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))

DO_2OP_S(vabds, DO_ABD)
DO_2OP_U(vabdu, DO_ABD)

static inline uint32_t do_vhadd_u(uint32_t n, uint32_t m)
{
    return ((uint64_t)n + m) >> 1;
}

static inline int32_t do_vhadd_s(int32_t n, int32_t m)
{
    return ((int64_t)n + m) >> 1;
}

static inline uint32_t do_vhsub_u(uint32_t n, uint32_t m)
{
    return ((uint64_t)n - m) >> 1;
}

static inline int32_t do_vhsub_s(int32_t n, int32_t m)
{
    return ((int64_t)n - m) >> 1;
}

DO_2OP_S(vhadds, do_vhadd_s)
DO_2OP_U(vhaddu, do_vhadd_u)
DO_2OP_S(vhsubs, do_vhsub_s)
DO_2OP_U(vhsubu, do_vhsub_u)

#define DO_VSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL)
#define DO_VSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL)
#define DO_VRSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL)
#define DO_VRSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL)

DO_2OP_S(vshls, DO_VSHLS)
DO_2OP_U(vshlu, DO_VSHLU)
DO_2OP_S(vrshls, DO_VRSHLS)
DO_2OP_U(vrshlu, DO_VRSHLU)

#define DO_RHADD_S(N, M) (((int64_t)(N) + (M) + 1) >> 1)
#define DO_RHADD_U(N, M) (((uint64_t)(N) + (M) + 1) >> 1)

DO_2OP_S(vrhadds, DO_RHADD_S)
DO_2OP_U(vrhaddu, DO_RHADD_U)

static void do_vadc(CPUARMState *env, uint32_t *d, uint32_t *n, uint32_t *m,
                    uint32_t inv, uint32_t carry_in, bool update_flags)
{
    uint16_t mask = mve_element_mask(env);
    unsigned e;

    /* If any additions trigger, we will update flags. */
    if (mask & 0x1111) {
        update_flags = true;
    }

    for (e = 0; e < 16 / 4; e++, mask >>= 4) {
        uint64_t r = carry_in;
        r += n[H4(e)];
        r += m[H4(e)] ^ inv;
        if (mask & 1) {
            carry_in = r >> 32;
        }
        mergemask(&d[H4(e)], r, mask);
    }

    if (update_flags) {
        /* Store C, clear NZV. */
        env->vfp.xregs[ARM_VFP_FPSCR] &= ~FPCR_NZCV_MASK;
        env->vfp.xregs[ARM_VFP_FPSCR] |= carry_in * FPCR_C;
    }
    mve_advance_vpt(env);
}

void HELPER(mve_vadc)(CPUARMState *env, void *vd, void *vn, void *vm)
{
    bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C;
    do_vadc(env, vd, vn, vm, 0, carry_in, false);
}

void HELPER(mve_vsbc)(CPUARMState *env, void *vd, void *vn, void *vm)
{
    bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C;
    do_vadc(env, vd, vn, vm, -1, carry_in, false);
}


void HELPER(mve_vadci)(CPUARMState *env, void *vd, void *vn, void *vm)
{
    do_vadc(env, vd, vn, vm, 0, 0, true);
}

void HELPER(mve_vsbci)(CPUARMState *env, void *vd, void *vn, void *vm)
{
    do_vadc(env, vd, vn, vm, -1, 1, true);
}

#define DO_VCADD(OP, ESIZE, TYPE, FN0, FN1) \
    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
    { \
        TYPE *d = vd, *n = vn, *m = vm; \
        uint16_t mask = mve_element_mask(env); \
        unsigned e; \
        TYPE r[16 / ESIZE]; \
        /* Calculate all results first to avoid overwriting inputs */ \
        for (e = 0; e < 16 / ESIZE; e++) { \
            if (!(e & 1)) { \
                r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)]); \
            } else { \
                r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)]); \
            } \
        } \
        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
            mergemask(&d[H##ESIZE(e)], r[e], mask); \
        } \
        mve_advance_vpt(env); \
    }

#define DO_VCADD_ALL(OP, FN0, FN1) \
    DO_VCADD(OP##b, 1, int8_t, FN0, FN1) \
    DO_VCADD(OP##h, 2, int16_t, FN0, FN1) \
    DO_VCADD(OP##w, 4, int32_t, FN0, FN1)

DO_VCADD_ALL(vcadd90, DO_SUB, DO_ADD)
DO_VCADD_ALL(vcadd270, DO_ADD, DO_SUB)
DO_VCADD_ALL(vhcadd90, do_vhsub_s, do_vhadd_s)
DO_VCADD_ALL(vhcadd270, do_vhadd_s, do_vhsub_s)

static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s)
{
    if (val > max) {
        *s = true;
        return max;
    } else if (val < min) {
        *s = true;
        return min;
    }
    return val;
}

#define DO_SQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, INT8_MIN, INT8_MAX, s)
#define DO_SQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, INT16_MIN, INT16_MAX, s)
#define DO_SQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, INT32_MIN, INT32_MAX, s)

#define DO_UQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT8_MAX, s)
#define DO_UQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT16_MAX, s)
#define DO_UQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT32_MAX, s)

#define DO_SQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, INT8_MIN, INT8_MAX, s)
#define DO_SQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, INT16_MIN, INT16_MAX, s)
#define DO_SQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, INT32_MIN, INT32_MAX, s)

#define DO_UQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT8_MAX, s)
#define DO_UQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT16_MAX, s)
#define DO_UQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT32_MAX, s)

/*
 * For QDMULH and QRDMULH we simplify "double and shift by esize" into
 * "shift by esize-1", adjusting the QRDMULH rounding constant to match.
 */
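/*
 * Worked equation (illustrative), byte case: the architectural QRDMULH
 * result is (2 * n * m + (1 << 7)) >> 8, which is the same value as the
 * (n * m + (1 << 6)) >> 7 used below, because
 * 2 * n * m + 128 == 2 * (n * m + 64).
 */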
#define DO_QDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m) >> 7, \
                                        INT8_MIN, INT8_MAX, s)
#define DO_QDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m) >> 15, \
                                        INT16_MIN, INT16_MAX, s)
#define DO_QDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m) >> 31, \
                                        INT32_MIN, INT32_MAX, s)

#define DO_QRDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 6)) >> 7, \
                                         INT8_MIN, INT8_MAX, s)
#define DO_QRDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 14)) >> 15, \
                                         INT16_MIN, INT16_MAX, s)
#define DO_QRDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 30)) >> 31, \
                                         INT32_MIN, INT32_MAX, s)

DO_2OP_SAT(vqdmulhb, 1, int8_t, DO_QDMULH_B)
DO_2OP_SAT(vqdmulhh, 2, int16_t, DO_QDMULH_H)
DO_2OP_SAT(vqdmulhw, 4, int32_t, DO_QDMULH_W)

DO_2OP_SAT(vqrdmulhb, 1, int8_t, DO_QRDMULH_B)
DO_2OP_SAT(vqrdmulhh, 2, int16_t, DO_QRDMULH_H)
DO_2OP_SAT(vqrdmulhw, 4, int32_t, DO_QRDMULH_W)

DO_2OP_SAT(vqaddub, 1, uint8_t, DO_UQADD_B)
DO_2OP_SAT(vqadduh, 2, uint16_t, DO_UQADD_H)
DO_2OP_SAT(vqadduw, 4, uint32_t, DO_UQADD_W)
DO_2OP_SAT(vqaddsb, 1, int8_t, DO_SQADD_B)
DO_2OP_SAT(vqaddsh, 2, int16_t, DO_SQADD_H)
DO_2OP_SAT(vqaddsw, 4, int32_t, DO_SQADD_W)

DO_2OP_SAT(vqsubub, 1, uint8_t, DO_UQSUB_B)
DO_2OP_SAT(vqsubuh, 2, uint16_t, DO_UQSUB_H)
DO_2OP_SAT(vqsubuw, 4, uint32_t, DO_UQSUB_W)
DO_2OP_SAT(vqsubsb, 1, int8_t, DO_SQSUB_B)
DO_2OP_SAT(vqsubsh, 2, int16_t, DO_SQSUB_H)
DO_2OP_SAT(vqsubsw, 4, int32_t, DO_SQSUB_W)

/*
 * This wrapper fixes up the impedance mismatch between do_sqrshl_bhs()
 * and friends wanting a uint32_t* sat and our needing a bool*.
 */
#define WRAP_QRSHL_HELPER(FN, N, M, ROUND, satp) \
    ({ \
        uint32_t su32 = 0; \
        typeof(N) r = FN(N, (int8_t)(M), sizeof(N) * 8, ROUND, &su32); \
        if (su32) { \
            *satp = true; \
        } \
        r; \
    })

#define DO_SQSHL_OP(N, M, satp) \
    WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, false, satp)
#define DO_UQSHL_OP(N, M, satp) \
    WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, false, satp)
#define DO_SQRSHL_OP(N, M, satp) \
    WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, true, satp)
#define DO_UQRSHL_OP(N, M, satp) \
    WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, true, satp)
#define DO_SUQSHL_OP(N, M, satp) \
    WRAP_QRSHL_HELPER(do_suqrshl_bhs, N, M, false, satp)

DO_2OP_SAT_S(vqshls, DO_SQSHL_OP)
DO_2OP_SAT_U(vqshlu, DO_UQSHL_OP)
DO_2OP_SAT_S(vqrshls, DO_SQRSHL_OP)
DO_2OP_SAT_U(vqrshlu, DO_UQRSHL_OP)

/*
 * Multiply add dual returning high half
 * The 'FN' here takes four inputs A, B, C, D, a 0/1 indicator of
 * whether to add the rounding constant, and the pointer to the
 * saturation flag, and should do "(A * B + C * D) * 2 + rounding constant",
 * saturate to twice the input size and return the high half; or
 * (A * B - C * D) etc for VQDMLSDH.
 */
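/*
 * Illustrative note on the macro below: with XCHG == 0 only the
 * even-numbered output elements are written, from the products
 * n[e] * m[e] and n[e + 1] * m[e + 1]; with XCHG == 1 only the odd
 * elements are written, using the crosswise products n[e] * m[e - 1]
 * and n[e - 1] * m[e].
 */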
#define DO_VQDMLADH_OP(OP, ESIZE, TYPE, XCHG, ROUND, FN) \
    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
                                void *vm) \
    { \
        TYPE *d = vd, *n = vn, *m = vm; \
        uint16_t mask = mve_element_mask(env); \
        unsigned e; \
        bool qc = false; \
        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
            bool sat = false; \
            if ((e & 1) == XCHG) { \
                TYPE r = FN(n[H##ESIZE(e)], \
                            m[H##ESIZE(e - XCHG)], \
                            n[H##ESIZE(e + (1 - 2 * XCHG))], \
                            m[H##ESIZE(e + (1 - XCHG))], \
                            ROUND, &sat); \
                mergemask(&d[H##ESIZE(e)], r, mask); \
                qc |= sat & mask & 1; \
            } \
        } \
        if (qc) { \
            env->vfp.qc[0] = qc; \
        } \
        mve_advance_vpt(env); \
    }

static int8_t do_vqdmladh_b(int8_t a, int8_t b, int8_t c, int8_t d,
                            int round, bool *sat)
{
    int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 7);
    return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
}

static int16_t do_vqdmladh_h(int16_t a, int16_t b, int16_t c, int16_t d,
                             int round, bool *sat)
{
    int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 15);
    return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
}

static int32_t do_vqdmladh_w(int32_t a, int32_t b, int32_t c, int32_t d,
                             int round, bool *sat)
{
    int64_t m1 = (int64_t)a * b;
    int64_t m2 = (int64_t)c * d;
    int64_t r;
    /*
     * Architecturally we should do the entire add, double, round
     * and then check for saturation. We do three saturating adds,
     * but we need to be careful about the order. If the first
     * m1 + m2 saturates then it's impossible for the *2+rc to
     * bring it back into the non-saturated range. However, if
     * m1 + m2 is negative then it's possible that doing the doubling
     * would take the intermediate result below INT64_MIN and the
     * addition of the rounding constant then brings it back in range.
     * So we add half the rounding constant before doubling rather
     * than adding the rounding constant after the doubling.
     */
    if (sadd64_overflow(m1, m2, &r) ||
        sadd64_overflow(r, (round << 30), &r) ||
        sadd64_overflow(r, r, &r)) {
        *sat = true;
        return r < 0 ? INT32_MAX : INT32_MIN;
    }
    return r >> 32;
}

static int8_t do_vqdmlsdh_b(int8_t a, int8_t b, int8_t c, int8_t d,
                            int round, bool *sat)
{
    int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 7);
    return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
}

static int16_t do_vqdmlsdh_h(int16_t a, int16_t b, int16_t c, int16_t d,
                             int round, bool *sat)
{
    int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 15);
    return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
}

static int32_t do_vqdmlsdh_w(int32_t a, int32_t b, int32_t c, int32_t d,
                             int round, bool *sat)
{
    int64_t m1 = (int64_t)a * b;
    int64_t m2 = (int64_t)c * d;
    int64_t r;
    /* The same ordering issue as in do_vqdmladh_w applies here too */
    if (ssub64_overflow(m1, m2, &r) ||
        sadd64_overflow(r, (round << 30), &r) ||
        sadd64_overflow(r, r, &r)) {
        *sat = true;
        return r < 0 ? INT32_MAX : INT32_MIN;
    }
    return r >> 32;
}

DO_VQDMLADH_OP(vqdmladhb, 1, int8_t, 0, 0, do_vqdmladh_b)
DO_VQDMLADH_OP(vqdmladhh, 2, int16_t, 0, 0, do_vqdmladh_h)
DO_VQDMLADH_OP(vqdmladhw, 4, int32_t, 0, 0, do_vqdmladh_w)
DO_VQDMLADH_OP(vqdmladhxb, 1, int8_t, 1, 0, do_vqdmladh_b)
DO_VQDMLADH_OP(vqdmladhxh, 2, int16_t, 1, 0, do_vqdmladh_h)
DO_VQDMLADH_OP(vqdmladhxw, 4, int32_t, 1, 0, do_vqdmladh_w)

DO_VQDMLADH_OP(vqrdmladhb, 1, int8_t, 0, 1, do_vqdmladh_b)
DO_VQDMLADH_OP(vqrdmladhh, 2, int16_t, 0, 1, do_vqdmladh_h)
DO_VQDMLADH_OP(vqrdmladhw, 4, int32_t, 0, 1, do_vqdmladh_w)
DO_VQDMLADH_OP(vqrdmladhxb, 1, int8_t, 1, 1, do_vqdmladh_b)
DO_VQDMLADH_OP(vqrdmladhxh, 2, int16_t, 1, 1, do_vqdmladh_h)
DO_VQDMLADH_OP(vqrdmladhxw, 4, int32_t, 1, 1, do_vqdmladh_w)

DO_VQDMLADH_OP(vqdmlsdhb, 1, int8_t, 0, 0, do_vqdmlsdh_b)
DO_VQDMLADH_OP(vqdmlsdhh, 2, int16_t, 0, 0, do_vqdmlsdh_h)
DO_VQDMLADH_OP(vqdmlsdhw, 4, int32_t, 0, 0, do_vqdmlsdh_w)
DO_VQDMLADH_OP(vqdmlsdhxb, 1, int8_t, 1, 0, do_vqdmlsdh_b)
DO_VQDMLADH_OP(vqdmlsdhxh, 2, int16_t, 1, 0, do_vqdmlsdh_h)
DO_VQDMLADH_OP(vqdmlsdhxw, 4, int32_t, 1, 0, do_vqdmlsdh_w)

DO_VQDMLADH_OP(vqrdmlsdhb, 1, int8_t, 0, 1, do_vqdmlsdh_b)
DO_VQDMLADH_OP(vqrdmlsdhh, 2, int16_t, 0, 1, do_vqdmlsdh_h)
DO_VQDMLADH_OP(vqrdmlsdhw, 4, int32_t, 0, 1, do_vqdmlsdh_w)
DO_VQDMLADH_OP(vqrdmlsdhxb, 1, int8_t, 1, 1, do_vqdmlsdh_b)
DO_VQDMLADH_OP(vqrdmlsdhxh, 2, int16_t, 1, 1, do_vqdmlsdh_h)
DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w)

#define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \
    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
                                uint32_t rm) \
    { \
        TYPE *d = vd, *n = vn; \
        TYPE m = rm; \
        uint16_t mask = mve_element_mask(env); \
        unsigned e; \
        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
            mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m), mask); \
        } \
        mve_advance_vpt(env); \
    }

#define DO_2OP_SAT_SCALAR(OP, ESIZE, TYPE, FN) \
    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
                                uint32_t rm) \
    { \
        TYPE *d = vd, *n = vn; \
        TYPE m = rm; \
        uint16_t mask = mve_element_mask(env); \
        unsigned e; \
        bool qc = false; \
        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
            bool sat = false; \
            mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m, &sat), \
                      mask); \
            qc |= sat & mask & 1; \
        } \
        if (qc) { \
            env->vfp.qc[0] = qc; \
        } \
        mve_advance_vpt(env); \
    }

/* provide unsigned 2-op scalar helpers for all sizes */
#define DO_2OP_SCALAR_U(OP, FN) \
    DO_2OP_SCALAR(OP##b, 1, uint8_t, FN) \
    DO_2OP_SCALAR(OP##h, 2, uint16_t, FN) \
    DO_2OP_SCALAR(OP##w, 4, uint32_t, FN)
#define DO_2OP_SCALAR_S(OP, FN) \
    DO_2OP_SCALAR(OP##b, 1, int8_t, FN) \
    DO_2OP_SCALAR(OP##h, 2, int16_t, FN) \
    DO_2OP_SCALAR(OP##w, 4, int32_t, FN)

DO_2OP_SCALAR_U(vadd_scalar, DO_ADD)
DO_2OP_SCALAR_U(vsub_scalar, DO_SUB)
DO_2OP_SCALAR_U(vmul_scalar, DO_MUL)
DO_2OP_SCALAR_S(vhadds_scalar, do_vhadd_s)
DO_2OP_SCALAR_U(vhaddu_scalar, do_vhadd_u)
DO_2OP_SCALAR_S(vhsubs_scalar, do_vhsub_s)
DO_2OP_SCALAR_U(vhsubu_scalar, do_vhsub_u)

DO_2OP_SAT_SCALAR(vqaddu_scalarb, 1, uint8_t, DO_UQADD_B)
DO_2OP_SAT_SCALAR(vqaddu_scalarh, 2, uint16_t, DO_UQADD_H)
DO_2OP_SAT_SCALAR(vqaddu_scalarw, 4, uint32_t, DO_UQADD_W)
DO_2OP_SAT_SCALAR(vqadds_scalarb, 1, int8_t, DO_SQADD_B)
DO_2OP_SAT_SCALAR(vqadds_scalarh, 2, int16_t, DO_SQADD_H)
DO_2OP_SAT_SCALAR(vqadds_scalarw, 4, int32_t, DO_SQADD_W)

DO_2OP_SAT_SCALAR(vqsubu_scalarb, 1, uint8_t, DO_UQSUB_B)
DO_2OP_SAT_SCALAR(vqsubu_scalarh, 2, uint16_t, DO_UQSUB_H)
DO_2OP_SAT_SCALAR(vqsubu_scalarw, 4, uint32_t, DO_UQSUB_W)
DO_2OP_SAT_SCALAR(vqsubs_scalarb, 1, int8_t, DO_SQSUB_B)
DO_2OP_SAT_SCALAR(vqsubs_scalarh, 2, int16_t, DO_SQSUB_H)
DO_2OP_SAT_SCALAR(vqsubs_scalarw, 4, int32_t, DO_SQSUB_W)

DO_2OP_SAT_SCALAR(vqdmulh_scalarb, 1, int8_t, DO_QDMULH_B)
DO_2OP_SAT_SCALAR(vqdmulh_scalarh, 2, int16_t, DO_QDMULH_H)
DO_2OP_SAT_SCALAR(vqdmulh_scalarw, 4, int32_t, DO_QDMULH_W)
DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B)
DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H)
DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W)

/*
 * Long saturating scalar ops. As with DO_2OP_L, TYPE and H are for the
 * input (smaller) type and LESIZE, LTYPE, LH for the output (long) type.
 * SATMASK specifies which bits of the predicate mask matter for determining
 * whether to propagate a saturation indication into FPSCR.QC -- for
 * the 16x16->32 case we must check only the bit corresponding to the T or B
 * half that we used, but for the 32x32->64 case we propagate if the mask
 * bit is set for either half.
 */
#define DO_2OP_SAT_SCALAR_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \
    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
                                uint32_t rm) \
    { \
        LTYPE *d = vd; \
        TYPE *n = vn; \
        TYPE m = rm; \
        uint16_t mask = mve_element_mask(env); \
        unsigned le; \
        bool qc = false; \
        for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
            bool sat = false; \
            LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], m, &sat); \
            mergemask(&d[H##LESIZE(le)], r, mask); \
            qc |= sat && (mask & SATMASK); \
        } \
        if (qc) { \
            env->vfp.qc[0] = qc; \
        } \
        mve_advance_vpt(env); \
    }

static inline int32_t do_qdmullh(int16_t n, int16_t m, bool *sat)
{
    int64_t r = ((int64_t)n * m) * 2;
    return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat);
}

static inline int64_t do_qdmullw(int32_t n, int32_t m, bool *sat)
{
    /* The multiply can't overflow, but the doubling might */
    int64_t r = (int64_t)n * m;
    if (r > INT64_MAX / 2) {
        *sat = true;
        return INT64_MAX;
    } else if (r < INT64_MIN / 2) {
        *sat = true;
        return INT64_MIN;
    } else {
        return r * 2;
    }
}

#define SATMASK16B 1
#define SATMASK16T (1 << 2)
#define SATMASK32 ((1 << 4) | 1)

DO_2OP_SAT_SCALAR_L(vqdmullb_scalarh, 0, 2, int16_t, 4, int32_t, \
                    do_qdmullh, SATMASK16B)
DO_2OP_SAT_SCALAR_L(vqdmullb_scalarw, 0, 4, int32_t, 8, int64_t, \
                    do_qdmullw, SATMASK32)
DO_2OP_SAT_SCALAR_L(vqdmullt_scalarh, 1, 2, int16_t, 4, int32_t, \
                    do_qdmullh, SATMASK16T)
DO_2OP_SAT_SCALAR_L(vqdmullt_scalarw, 1, 4, int32_t, 8, int64_t, \
                    do_qdmullw, SATMASK32)

/*
 * Long saturating ops
 */
#define DO_2OP_SAT_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \
    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
                                void *vm) \
    { \
        LTYPE *d = vd; \
        TYPE *n = vn, *m = vm; \
        uint16_t mask = mve_element_mask(env); \
        unsigned le; \
        bool qc = false; \
        for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
            bool sat = false; \
            LTYPE op1 = n[H##ESIZE(le * 2 + TOP)]; \
            LTYPE op2 = m[H##ESIZE(le * 2 + TOP)]; \
            mergemask(&d[H##LESIZE(le)], FN(op1, op2, &sat), mask); \
            qc |= sat && (mask & SATMASK); \
        } \
        if (qc) { \
            env->vfp.qc[0] = qc; \
        } \
        mve_advance_vpt(env); \
    }

DO_2OP_SAT_L(vqdmullbh, 0, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16B)
DO_2OP_SAT_L(vqdmullbw, 0, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32)
DO_2OP_SAT_L(vqdmullth, 1, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16T)
DO_2OP_SAT_L(vqdmulltw, 1, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32)

static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m)
{
    m &= 0xff;
    if (m == 0) {
        return 0;
    }
    n = revbit8(n);
    if (m < 8) {
        n >>= 8 - m;
    }
    return n;
}

static inline uint32_t do_vbrsrh(uint32_t n, uint32_t m)
{
    m &= 0xff;
    if (m == 0) {
        return 0;
    }
    n = revbit16(n);
    if (m < 16) {
        n >>= 16 - m;
    }
    return n;
}

static inline uint32_t do_vbrsrw(uint32_t n, uint32_t m)
{
    m &= 0xff;
    if (m == 0) {
        return 0;
    }
    n = revbit32(n);
    if (m < 32) {
        n >>= 32 - m;
    }
    return n;
}

DO_2OP_SCALAR(vbrsrb, 1, uint8_t, do_vbrsrb)
DO_2OP_SCALAR(vbrsrh, 2, uint16_t, do_vbrsrh)
DO_2OP_SCALAR(vbrsrw, 4, uint32_t, do_vbrsrw)

/*
 * Multiply add long dual accumulate ops.
 */
#define DO_LDAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \
    uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
                                    void *vm, uint64_t a) \
    { \
        uint16_t mask = mve_element_mask(env); \
        unsigned e; \
        TYPE *n = vn, *m = vm; \
        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
            if (mask & 1) { \
                if (e & 1) { \
                    a ODDACC \
                        (int64_t)n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \
                } else { \
                    a EVENACC \
                        (int64_t)n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \
                } \
            } \
        } \
        mve_advance_vpt(env); \
        return a; \
    }

DO_LDAV(vmlaldavsh, 2, int16_t, false, +=, +=)
DO_LDAV(vmlaldavxsh, 2, int16_t, true, +=, +=)
DO_LDAV(vmlaldavsw, 4, int32_t, false, +=, +=)
DO_LDAV(vmlaldavxsw, 4, int32_t, true, +=, +=)

DO_LDAV(vmlaldavuh, 2, uint16_t, false, +=, +=)
DO_LDAV(vmlaldavuw, 4, uint32_t, false, +=, +=)

DO_LDAV(vmlsldavsh, 2, int16_t, false, +=, -=)
DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=)
DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=)
DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=)

/*
 * Rounding multiply add long dual accumulate high. In the pseudocode
 * this is implemented with a 72-bit internal accumulator value of which
 * the top 64 bits are returned. We optimize this to avoid having to
 * use 128-bit arithmetic -- we can do this because the 72-bit accumulator
 * is squashed back into 64-bits after each beat.
 */
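/*
 * Illustrative note: the per-beat squashing is the
 * "mul = (mul >> 8) + ((mul >> 7) & 1)" step below, i.e. drop the low
 * 8 bits of each product with round-to-nearest: e.g. mul == 0x180
 * (1.5 in units of 256) becomes 1 + 1 == 2.
 */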
#define DO_LDAVH(OP, TYPE, LTYPE, XCHG, SUB) \
    uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
                                    void *vm, uint64_t a) \
    { \
        uint16_t mask = mve_element_mask(env); \
        unsigned e; \
        TYPE *n = vn, *m = vm; \
        for (e = 0; e < 16 / 4; e++, mask >>= 4) { \
            if (mask & 1) { \
                LTYPE mul; \
                if (e & 1) { \
                    mul = (LTYPE)n[H4(e - 1 * XCHG)] * m[H4(e)]; \
                    if (SUB) { \
                        mul = -mul; \
                    } \
                } else { \
                    mul = (LTYPE)n[H4(e + 1 * XCHG)] * m[H4(e)]; \
                } \
                mul = (mul >> 8) + ((mul >> 7) & 1); \
                a += mul; \
            } \
        } \
        mve_advance_vpt(env); \
        return a; \
    }

DO_LDAVH(vrmlaldavhsw, int32_t, int64_t, false, false)
DO_LDAVH(vrmlaldavhxsw, int32_t, int64_t, true, false)

DO_LDAVH(vrmlaldavhuw, uint32_t, uint64_t, false, false)

DO_LDAVH(vrmlsldavhsw, int32_t, int64_t, false, true)
DO_LDAVH(vrmlsldavhxsw, int32_t, int64_t, true, true)

/* Vector add across vector */
#define DO_VADDV(OP, ESIZE, TYPE) \
    uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
                                    uint32_t ra) \
    { \
        uint16_t mask = mve_element_mask(env); \
        unsigned e; \
        TYPE *m = vm; \
        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
            if (mask & 1) { \
                ra += m[H##ESIZE(e)]; \
            } \
        } \
        mve_advance_vpt(env); \
        return ra; \
    } \

DO_VADDV(vaddvsb, 1, int8_t)
DO_VADDV(vaddvsh, 2, int16_t)
DO_VADDV(vaddvsw, 4, int32_t)
DO_VADDV(vaddvub, 1, uint8_t)
DO_VADDV(vaddvuh, 2, uint16_t)
DO_VADDV(vaddvuw, 4, uint32_t)

#define DO_VADDLV(OP, TYPE, LTYPE) \
    uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
                                    uint64_t ra) \
    { \
        uint16_t mask = mve_element_mask(env); \
        unsigned e; \
        TYPE *m = vm; \
        for (e = 0; e < 16 / 4; e++, mask >>= 4) { \
            if (mask & 1) { \
                ra += (LTYPE)m[H4(e)]; \
            } \
        } \
        mve_advance_vpt(env); \
        return ra; \
    } \

DO_VADDLV(vaddlv_s, int32_t, int64_t)
DO_VADDLV(vaddlv_u, uint32_t, uint64_t)

/* Shifts by immediate */
#define DO_2SHIFT(OP, ESIZE, TYPE, FN) \
    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
                                void *vm, uint32_t shift) \
    { \
        TYPE *d = vd, *m = vm; \
        uint16_t mask = mve_element_mask(env); \
        unsigned e; \
        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
            mergemask(&d[H##ESIZE(e)], \
                      FN(m[H##ESIZE(e)], shift), mask); \
        } \
        mve_advance_vpt(env); \
    }

#define DO_2SHIFT_SAT(OP, ESIZE, TYPE, FN) \
    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
                                void *vm, uint32_t shift) \
    { \
        TYPE *d = vd, *m = vm; \
        uint16_t mask = mve_element_mask(env); \
        unsigned e; \
        bool qc = false; \
        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
            bool sat = false; \
            mergemask(&d[H##ESIZE(e)], \
                      FN(m[H##ESIZE(e)], shift, &sat), mask); \
            qc |= sat & mask & 1; \
        } \
        if (qc) { \
            env->vfp.qc[0] = qc; \
        } \
        mve_advance_vpt(env); \
    }

/* provide unsigned 2-op shift helpers for all sizes */
#define DO_2SHIFT_U(OP, FN) \
    DO_2SHIFT(OP##b, 1, uint8_t, FN) \
    DO_2SHIFT(OP##h, 2, uint16_t, FN) \
    DO_2SHIFT(OP##w, 4, uint32_t, FN)
#define DO_2SHIFT_S(OP, FN) \
    DO_2SHIFT(OP##b, 1, int8_t, FN) \
    DO_2SHIFT(OP##h, 2, int16_t, FN) \
    DO_2SHIFT(OP##w, 4, int32_t, FN)

#define DO_2SHIFT_SAT_U(OP, FN) \
    DO_2SHIFT_SAT(OP##b, 1, uint8_t, FN) \
    DO_2SHIFT_SAT(OP##h, 2, uint16_t, FN) \
    DO_2SHIFT_SAT(OP##w, 4, uint32_t, FN)
#define DO_2SHIFT_SAT_S(OP, FN) \
    DO_2SHIFT_SAT(OP##b, 1, int8_t, FN) \
    DO_2SHIFT_SAT(OP##h, 2, int16_t, FN) \
    DO_2SHIFT_SAT(OP##w, 4, int32_t, FN)

DO_2SHIFT_U(vshli_u, DO_VSHLU)
DO_2SHIFT_S(vshli_s, DO_VSHLS)
DO_2SHIFT_SAT_U(vqshli_u, DO_UQSHL_OP)
DO_2SHIFT_SAT_S(vqshli_s, DO_SQSHL_OP)
DO_2SHIFT_SAT_S(vqshlui_s, DO_SUQSHL_OP)
DO_2SHIFT_U(vrshli_u, DO_VRSHLU)
DO_2SHIFT_S(vrshli_s, DO_VRSHLS)

/* Shift-and-insert; we always work with 64 bits at a time */
#define DO_2SHIFT_INSERT(OP, ESIZE, SHIFTFN, MASKFN) \
    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
                                void *vm, uint32_t shift) \
    { \
        uint64_t *d = vd, *m = vm; \
        uint16_t mask; \
        uint64_t shiftmask; \
        unsigned e; \
        if (shift == ESIZE * 8) { \
            /* \
             * Only VSRI can shift by <dt>; it should mean "don't \
             * update the destination". The generic logic can't handle \
             * this because it would try to shift by an out-of-range \
             * amount, so special case it here. \
             */ \
            goto done; \
        } \
        assert(shift < ESIZE * 8); \
        mask = mve_element_mask(env); \
        /* ESIZE / 2 gives the MO_* value if ESIZE is in [1,2,4] */ \
        shiftmask = dup_const(ESIZE / 2, MASKFN(ESIZE * 8, shift)); \
        for (e = 0; e < 16 / 8; e++, mask >>= 8) { \
            uint64_t r = (SHIFTFN(m[H8(e)], shift) & shiftmask) | \
                         (d[H8(e)] & ~shiftmask); \
            mergemask(&d[H8(e)], r, mask); \
        } \
    done: \
        mve_advance_vpt(env); \
    }

#define DO_SHL(N, SHIFT) ((N) << (SHIFT))
#define DO_SHR(N, SHIFT) ((N) >> (SHIFT))
#define SHL_MASK(EBITS, SHIFT) MAKE_64BIT_MASK((SHIFT), (EBITS) - (SHIFT))
#define SHR_MASK(EBITS, SHIFT) MAKE_64BIT_MASK(0, (EBITS) - (SHIFT))

DO_2SHIFT_INSERT(vsrib, 1, DO_SHR, SHR_MASK)
DO_2SHIFT_INSERT(vsrih, 2, DO_SHR, SHR_MASK)
DO_2SHIFT_INSERT(vsriw, 4, DO_SHR, SHR_MASK)
DO_2SHIFT_INSERT(vslib, 1, DO_SHL, SHL_MASK)
DO_2SHIFT_INSERT(vslih, 2, DO_SHL, SHL_MASK)
DO_2SHIFT_INSERT(vsliw, 4, DO_SHL, SHL_MASK)

/*
 * Long shifts taking half-sized inputs from top or bottom of the input
 * vector and producing a double-width result. ESIZE, TYPE are for
 * the input, and LESIZE, LTYPE for the output.
 * Unlike the normal shift helpers, we do not handle negative shift counts,
 * because the long shift is strictly left-only.
 */
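/*
 * Example (illustrative): vshllb.s8 with shift == 1 reads byte elements
 * 0, 2, 4, ... and writes each 16-bit lane with the sign-extended value
 * shifted left, so an input byte of 0x80 (-128) produces 0xff00 (-256).
 */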
#define DO_VSHLL(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE) \
    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
                                void *vm, uint32_t shift) \
    { \
        LTYPE *d = vd; \
        TYPE *m = vm; \
        uint16_t mask = mve_element_mask(env); \
        unsigned le; \
        assert(shift <= 16); \
        for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
            LTYPE r = (LTYPE)m[H##ESIZE(le * 2 + TOP)] << shift; \
            mergemask(&d[H##LESIZE(le)], r, mask); \
        } \
        mve_advance_vpt(env); \
    }

#define DO_VSHLL_ALL(OP, TOP) \
    DO_VSHLL(OP##sb, TOP, 1, int8_t, 2, int16_t) \
    DO_VSHLL(OP##ub, TOP, 1, uint8_t, 2, uint16_t) \
    DO_VSHLL(OP##sh, TOP, 2, int16_t, 4, int32_t) \
    DO_VSHLL(OP##uh, TOP, 2, uint16_t, 4, uint32_t) \

DO_VSHLL_ALL(vshllb, false)
DO_VSHLL_ALL(vshllt, true)

/*
 * Narrowing right shifts, taking a double sized input, shifting it
 * and putting the result in either the top or bottom half of the output.
 * ESIZE, TYPE are the output, and LESIZE, LTYPE the input.
 */
#define DO_VSHRN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \
    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
                                void *vm, uint32_t shift) \
    { \
        LTYPE *m = vm; \
        TYPE *d = vd; \
        uint16_t mask = mve_element_mask(env); \
        unsigned le; \
        mask >>= ESIZE * TOP; \
        for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
            TYPE r = FN(m[H##LESIZE(le)], shift); \
            mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \
        } \
        mve_advance_vpt(env); \
    }

#define DO_VSHRN_ALL(OP, FN) \
    DO_VSHRN(OP##bb, false, 1, uint8_t, 2, uint16_t, FN) \
    DO_VSHRN(OP##bh, false, 2, uint16_t, 4, uint32_t, FN) \
    DO_VSHRN(OP##tb, true, 1, uint8_t, 2, uint16_t, FN) \
    DO_VSHRN(OP##th, true, 2, uint16_t, 4, uint32_t, FN)

static inline uint64_t do_urshr(uint64_t x, unsigned sh)
{
    if (likely(sh < 64)) {
        return (x >> sh) + ((x >> (sh - 1)) & 1);
    } else if (sh == 64) {
        return x >> 63;
    } else {
        return 0;
    }
}

static inline int64_t do_srshr(int64_t x, unsigned sh)
{
    if (likely(sh < 64)) {
        return (x >> sh) + ((x >> (sh - 1)) & 1);
    } else {
        /* Rounding the sign bit always produces 0. */
        return 0;
    }
}

DO_VSHRN_ALL(vshrn, DO_SHR)
DO_VSHRN_ALL(vrshrn, do_urshr)

static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max,
                                 bool *satp)
{
    if (val > max) {
        *satp = true;
        return max;
    } else if (val < min) {
        *satp = true;
        return min;
    } else {
        return val;
    }
}

/* Saturating narrowing right shifts */
#define DO_VSHRN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \
    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
                                void *vm, uint32_t shift) \
    { \
        LTYPE *m = vm; \
        TYPE *d = vd; \
        uint16_t mask = mve_element_mask(env); \
        bool qc = false; \
        unsigned le; \
        mask >>= ESIZE * TOP; \
        for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
            bool sat = false; \
            TYPE r = FN(m[H##LESIZE(le)], shift, &sat); \
            mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \
            qc |= sat & mask & 1; \
        } \
        if (qc) { \
            env->vfp.qc[0] = qc; \
        } \
        mve_advance_vpt(env); \
    }

#define DO_VSHRN_SAT_UB(BOP, TOP, FN) \
    DO_VSHRN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN) \
    DO_VSHRN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN)

#define DO_VSHRN_SAT_UH(BOP, TOP, FN) \
    DO_VSHRN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN) \
    DO_VSHRN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN)

#define DO_VSHRN_SAT_SB(BOP, TOP, FN) \
    DO_VSHRN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN) \
    DO_VSHRN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN)

#define DO_VSHRN_SAT_SH(BOP, TOP, FN) \
    DO_VSHRN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN) \
    DO_VSHRN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN)

#define DO_SHRN_SB(N, M, SATP) \
    do_sat_bhs((int64_t)(N) >> (M), INT8_MIN, INT8_MAX, SATP)
#define DO_SHRN_UB(N, M, SATP) \
    do_sat_bhs((uint64_t)(N) >> (M), 0, UINT8_MAX, SATP)
#define DO_SHRUN_B(N, M, SATP) \
    do_sat_bhs((int64_t)(N) >> (M), 0, UINT8_MAX, SATP)

#define DO_SHRN_SH(N, M, SATP) \
    do_sat_bhs((int64_t)(N) >> (M), INT16_MIN, INT16_MAX, SATP)
#define DO_SHRN_UH(N, M, SATP) \
    do_sat_bhs((uint64_t)(N) >> (M), 0, UINT16_MAX, SATP)
#define DO_SHRUN_H(N, M, SATP) \
    do_sat_bhs((int64_t)(N) >> (M), 0, UINT16_MAX, SATP)

#define DO_RSHRN_SB(N, M, SATP) \
    do_sat_bhs(do_srshr(N, M), INT8_MIN, INT8_MAX, SATP)
#define DO_RSHRN_UB(N, M, SATP) \
    do_sat_bhs(do_urshr(N, M), 0, UINT8_MAX, SATP)
#define DO_RSHRUN_B(N, M, SATP) \
    do_sat_bhs(do_srshr(N, M), 0, UINT8_MAX, SATP)

#define DO_RSHRN_SH(N, M, SATP) \
    do_sat_bhs(do_srshr(N, M), INT16_MIN, INT16_MAX, SATP)
#define DO_RSHRN_UH(N, M, SATP) \
    do_sat_bhs(do_urshr(N, M), 0, UINT16_MAX, SATP)
#define DO_RSHRUN_H(N, M, SATP) \
    do_sat_bhs(do_srshr(N, M), 0, UINT16_MAX, SATP)

DO_VSHRN_SAT_SB(vqshrnb_sb, vqshrnt_sb, DO_SHRN_SB)
DO_VSHRN_SAT_SH(vqshrnb_sh, vqshrnt_sh, DO_SHRN_SH)
DO_VSHRN_SAT_UB(vqshrnb_ub, vqshrnt_ub, DO_SHRN_UB)
DO_VSHRN_SAT_UH(vqshrnb_uh, vqshrnt_uh, DO_SHRN_UH)
DO_VSHRN_SAT_SB(vqshrunbb, vqshruntb, DO_SHRUN_B)
DO_VSHRN_SAT_SH(vqshrunbh, vqshrunth, DO_SHRUN_H)

DO_VSHRN_SAT_SB(vqrshrnb_sb, vqrshrnt_sb, DO_RSHRN_SB)
DO_VSHRN_SAT_SH(vqrshrnb_sh, vqrshrnt_sh, DO_RSHRN_SH)
DO_VSHRN_SAT_UB(vqrshrnb_ub, vqrshrnt_ub, DO_RSHRN_UB)
DO_VSHRN_SAT_UH(vqrshrnb_uh, vqrshrnt_uh, DO_RSHRN_UH)
DO_VSHRN_SAT_SB(vqrshrunbb, vqrshruntb, DO_RSHRUN_B)
DO_VSHRN_SAT_SH(vqrshrunbh, vqrshrunth, DO_RSHRUN_H)

uint32_t HELPER(mve_vshlc)(CPUARMState *env, void *vd, uint32_t rdm,
                           uint32_t shift)
{
    uint32_t *d = vd;
    uint16_t mask = mve_element_mask(env);
    unsigned e;
    uint32_t r;

    /*
     * For each 32-bit element, we shift it left, bringing in the
     * low 'shift' bits of rdm at the bottom. Bits shifted out at
     * the top become the new rdm, if the predicate mask permits.
     * The final rdm value is returned to update the register.
     * shift == 0 here means "shift by 32 bits".
     */
    if (shift == 0) {
        for (e = 0; e < 16 / 4; e++, mask >>= 4) {
            r = rdm;
            if (mask & 1) {
                rdm = d[H4(e)];
            }
            mergemask(&d[H4(e)], r, mask);
        }
    } else {
        uint32_t shiftmask = MAKE_64BIT_MASK(0, shift);

        for (e = 0; e < 16 / 4; e++, mask >>= 4) {
            r = (d[H4(e)] << shift) | (rdm & shiftmask);
            if (mask & 1) {
                rdm = d[H4(e)] >> (32 - shift);
            }
            mergemask(&d[H4(e)], r, mask);
        }
    }
    mve_advance_vpt(env);
    return rdm;
}

uint64_t HELPER(mve_sshrl)(CPUARMState *env, uint64_t n, uint32_t shift)
{
    return do_sqrshl_d(n, -(int8_t)shift, false, NULL);
}

uint64_t HELPER(mve_ushll)(CPUARMState *env, uint64_t n, uint32_t shift)
{
    return do_uqrshl_d(n, (int8_t)shift, false, NULL);
}

uint64_t HELPER(mve_sqshll)(CPUARMState *env, uint64_t n, uint32_t shift)
{
    return do_sqrshl_d(n, (int8_t)shift, false, &env->QF);
}

uint64_t HELPER(mve_uqshll)(CPUARMState *env, uint64_t n, uint32_t shift)
{
    return do_uqrshl_d(n, (int8_t)shift, false, &env->QF);
}

uint64_t HELPER(mve_sqrshrl)(CPUARMState *env, uint64_t n, uint32_t shift)
{
    return do_sqrshl_d(n, -(int8_t)shift, true, &env->QF);
}

uint64_t HELPER(mve_uqrshll)(CPUARMState *env, uint64_t n, uint32_t shift)
{
    return do_uqrshl_d(n, (int8_t)shift, true, &env->QF);
}

/* Operate on 64-bit values, but saturate at 48 bits */
static inline int64_t do_sqrshl48_d(int64_t src, int64_t shift,
                                    bool round, uint32_t *sat)
{
    int64_t val, extval;

    if (shift <= -48) {
        /* Rounding the sign bit always produces 0. */
        if (round) {
            return 0;
        }
        return src >> 63;
    } else if (shift < 0) {
        if (round) {
            src >>= -shift - 1;
            val = (src >> 1) + (src & 1);
        } else {
            val = src >> -shift;
        }
        extval = sextract64(val, 0, 48);
        if (!sat || val == extval) {
            return extval;
        }
    } else if (shift < 48) {
        int64_t extval = sextract64(src << shift, 0, 48);
        if (!sat || src == (extval >> shift)) {
            return extval;
        }
    } else if (!sat || src == 0) {
        return 0;
    }

    *sat = 1;
    return src >= 0 ? MAKE_64BIT_MASK(0, 47) : MAKE_64BIT_MASK(47, 17);
}

/* Operate on 64-bit values, but saturate at 48 bits */
static inline uint64_t do_uqrshl48_d(uint64_t src, int64_t shift,
                                     bool round, uint32_t *sat)
{
    uint64_t val, extval;

    if (shift <= -(48 + round)) {
        return 0;
    } else if (shift < 0) {
        if (round) {
            val = src >> (-shift - 1);
            val = (val >> 1) + (val & 1);
        } else {
            val = src >> -shift;
        }
        extval = extract64(val, 0, 48);
        if (!sat || val == extval) {
            return extval;
        }
    } else if (shift < 48) {
        uint64_t extval = extract64(src << shift, 0, 48);
        if (!sat || src == (extval >> shift)) {
            return extval;
        }
    } else if (!sat || src == 0) {
        return 0;
    }

    *sat = 1;
    return MAKE_64BIT_MASK(0, 48);
}

uint64_t HELPER(mve_sqrshrl48)(CPUARMState *env, uint64_t n, uint32_t shift)
{
    return do_sqrshl48_d(n, -(int8_t)shift, true, &env->QF);
}

uint64_t HELPER(mve_uqrshll48)(CPUARMState *env, uint64_t n, uint32_t shift)
{
    return do_uqrshl48_d(n, (int8_t)shift, true, &env->QF);
}

uint32_t HELPER(mve_uqshl)(CPUARMState *env, uint32_t n, uint32_t shift)
{
    return do_uqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
}

uint32_t HELPER(mve_sqshl)(CPUARMState *env, uint32_t n, uint32_t shift)
{
    return do_sqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
}

uint32_t HELPER(mve_uqrshl)(CPUARMState *env, uint32_t n, uint32_t shift)
{
    return do_uqrshl_bhs(n, (int8_t)shift, 32, true, &env->QF);
}

uint32_t HELPER(mve_sqrshr)(CPUARMState *env, uint32_t n, uint32_t shift)
{
    return do_sqrshl_bhs(n, -(int8_t)shift, 32, true, &env->QF);
}