 * M-profile MVE Operations
#include "exec/helper-proto.h"
#include "accel/tcg/cpu-ldst.h"
    if ((env->condexec_bits & 0xf) != 0) {
    eci = env->condexec_bits >> 4;
 * (3) low-overhead-branch tail predication will mask out part
 * We combine all these into a 16-bit result with the same semantics
 * 8-bit vector ops will look at all bits of the result;
 * 16-bit ops will look at bits 0, 2, 4, ...;
 * 32-bit ops will look at bits 0, 4, 8 and 12.
 * the 4-bit slice of the mask corresponding to a single beat.
    uint16_t mask = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0);
    if (!(env->v7m.vpr & R_V7M_VPR_MASK01_MASK)) {
    if (!(env->v7m.vpr & R_V7M_VPR_MASK23_MASK)) {
    if (env->v7m.ltpsize < 4 &&
        env->regs[14] <= (1 << (4 - env->v7m.ltpsize))) {
        int masklen = env->regs[14] << env->v7m.ltpsize;
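/*
 * Editor's sketch (not part of the original file): a standalone
 * illustration of how the 16-bit mask described above is consumed at
 * different element sizes. The function and variable names here are
 * hypothetical; only the bit positions come from the comment.
 */
#include <stdint.h>
#include <stdio.h>

static void show_element_mask(uint16_t mask, unsigned esize)
{
    /* each element owns a contiguous esize-bit slice of the mask */
    for (unsigned e = 0; e < 16 / esize; e++) {
        unsigned slice = (mask >> (e * esize)) & ((1u << esize) - 1);
        printf("esize %u, element %u: byte predicate %#x\n",
               esize, e, slice);
    }
}

int main(void)
{
    uint16_t mask = 0x00ff;     /* bottom eight bytes predicated true */
    show_element_mask(mask, 1); /* 8-bit ops: bits 0..15 */
    show_element_mask(mask, 2); /* 16-bit ops: bits 0, 2, 4, ... */
    show_element_mask(mask, 4); /* 32-bit ops: bits 0, 4, 8, 12 */
    return 0;
}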
    uint32_t vpr = env->v7m.vpr;
    if ((env->condexec_bits & 0xf) == 0) {
        env->condexec_bits = (env->condexec_bits == (ECI_A0A1A2B0 << 4)) ?
    env->v7m.vpr = vpr;
        OFFTYPE *m = vm; \
            addr = ADDRFN(base, m[H##ESIZE(e)]); \
                m[H##ESIZE(e)] = addr; \
        TYPE *m = vm; \
            addr = ADDRFN(base, m[H##ESIZE(e)]); \
            m[H##ESIZE(e)] = addr; \
 * 64-bit accesses are slightly different: they are done as two 32-bit
 * accesses, controlled by the predicate mask for the relevant beat,
 * and with a single 32-bit offset in the first of the two Qm elements.
 * Address writeback happens on the odd beats and updates the address
 * stored in the even-beat element.
        uint32_t *m = vm; \
            addr = ADDRFN(base, m[H4(e & ~1)]); \
                m[H4(e & ~1)] = addr - 4; \
        uint32_t *m = vm; \
            addr = ADDRFN(base, m[H4(e & ~1)]); \
                m[H4(e & ~1)] = addr - 4; \
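/*
 * Editor's sketch (hypothetical helper, not QEMU code, and ignoring the
 * H4 host-order macro): the addresses of the two 32-bit halves of one
 * 64-bit gather element, with the offset taken from the even-numbered
 * 32-bit slot of Qm as described above. On writeback the even-beat
 * element is updated to addr - 4, i.e. the address of the low half.
 */
#include <stdint.h>

static void gather64_addresses(uint32_t base, const uint32_t *qm,
                               unsigned pair, uint32_t *lo, uint32_t *hi)
{
    uint32_t offset = qm[pair * 2]; /* offset lives in the even element */
    *lo = base + offset;            /* even beat: low 32 bits */
    *hi = base + offset + 4;        /* odd beat: high 32 bits */
}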
 * one 32-bit memory access per beat.
        for (e = 3; e >= 0; e--) { \
        for (e = 3; e >= 0; e--) { \
        for (e = 1; e >= 0; e--) { \
 * The mergemask(D, R, M) macro performs the operation "*D = R" but
 * storing only the bytes which correspond to 1 bits in M,
 * leaving other bytes in *D unchanged.
#define mergemask(D, R, M) \
        int64_t *: mergemask_sq)(D, R, M)
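/*
 * Editor's sketch (not the QEMU implementation): one way to realise the
 * byte-granular merge that mergemask() performs, shown for a 32-bit
 * element. QEMU instead dispatches via _Generic to per-type helpers
 * such as mergemask_sq above.
 */
#include <stdint.h>

static void mergemask_uw_sketch(uint32_t *d, uint32_t r, uint16_t mask)
{
    uint32_t bytemask = 0;
    for (unsigned i = 0; i < 4; i++) {
        if (mask & (1u << i)) {
            bytemask |= 0xffu << (i * 8); /* byte i is predicated on */
        }
    }
    *d = (*d & ~bytemask) | (r & bytemask);
}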
 * into the 32-bit value, so we only need to write the 32-bit
 * value to all elements of the Qreg, allowing for predication.
    TYPE *d = vd, *m = vm; \
        mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)]), mask); \
#define DO_CLS_B(N) (clrsb32(N) - 24)
#define DO_CLS_H(N) (clrsb32(N) - 16)
#define DO_CLZ_B(N) (clz32(N) - 24)
#define DO_CLZ_H(N) (clz32(N) - 16)
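/*
 * Editor's worked example (standalone, not from the original file): the
 * element is widened to 32 bits before clz32()/clrsb32(), so the count
 * includes 24 (or 16 for halfwords) extra bits from the widening, which
 * the subtraction removes again. The _sketch helpers stand in for
 * QEMU's host-utils functions.
 */
#include <assert.h>
#include <stdint.h>

static int clz32_sketch(uint32_t x) { return x ? __builtin_clz(x) : 32; }
static int clrsb32_sketch(int32_t x) { return __builtin_clrsb(x); }

int main(void)
{
    uint8_t u = 0x10;  /* 00010000: three leading zeros in 8 bits */
    assert(clz32_sketch(u) - 24 == 3);
    int8_t s = 0xf0;   /* 11110000: three redundant sign bits */
    assert(clrsb32_sketch(s) - 24 == 3);
    return 0;
}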
#define DO_ABS(N) ((N) < 0 ? -(N) : (N))
#define DO_NEG(N) (-(N))
 * All these insns work at 64-bit widths.
    TYPE *d = vd, *n = vn, *m = vm; \
        FN(n[H##ESIZE(e)], m[H##ESIZE(e)]), mask); \
/* provide unsigned 2-op helpers for all sizes */
/* provide signed 2-op helpers for all sizes */
899 * "Long" operations where two half-sized inputs (taken from either the
900 * top or the bottom of the input vector) produce a double-width result.
907 TYPE *n = vn, *m = vm; \
912 m[H##ESIZE(le * 2 + TOP)]); \
921 TYPE *d = vd, *n = vn, *m = vm; \
927 TYPE r_ = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], &sat); \
932 env->vfp.qc[0] = qc; \
937 /* provide unsigned 2-op helpers for all sizes */
943 /* provide signed 2-op helpers for all sizes */
#define DO_AND(N, M) ((N) & (M))
#define DO_BIC(N, M) ((N) & ~(M))
#define DO_ORR(N, M) ((N) | (M))
#define DO_ORN(N, M) ((N) | ~(M))
#define DO_EOR(N, M) ((N) ^ (M))
#define DO_ADD(N, M) ((N) + (M))
#define DO_SUB(N, M) ((N) - (M))
#define DO_MUL(N, M) ((N) * (M))
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_w(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint8_t do_rmulh_b(int32_t n, int32_t m)
{
    return (n * m + (1U << 7)) >> 8;
}

static inline uint16_t do_rmulh_h(int32_t n, int32_t m)
{
    return (n * m + (1U << 15)) >> 16;
}

static inline uint32_t do_rmulh_w(int64_t n, int64_t m)
{
    return (n * m + (1U << 31)) >> 32;
}
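/*
 * Editor's example (standalone check, not QEMU code): the rounding
 * variant adds half of the weight of the discarded low part before
 * taking the high half, so a carry can propagate into the result.
 */
#include <assert.h>
#include <stdint.h>

static uint8_t rmulh_b_sketch(int32_t n, int32_t m)
{
    return (n * m + (1U << 7)) >> 8;
}

int main(void)
{
    /* 15 * 200 = 3000 = 0x0bb8: low byte 0xb8 >= 0x80, so round up */
    assert((uint8_t)((15 * 200) >> 8) == 11); /* truncated high byte */
    assert(rmulh_b_sketch(15, 200) == 12);    /* rounded high byte */
    return 0;
}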
#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
static inline uint32_t do_vhadd_u(uint32_t n, uint32_t m)
{
    return ((uint64_t)n + m) >> 1;
}

static inline int32_t do_vhadd_s(int32_t n, int32_t m)
{
    return ((int64_t)n + m) >> 1;
}

static inline uint32_t do_vhsub_u(uint32_t n, uint32_t m)
{
    return ((uint64_t)n - m) >> 1;
}

static inline int32_t do_vhsub_s(int32_t n, int32_t m)
{
    return ((int64_t)n - m) >> 1;
}
#define DO_VSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL)
#define DO_VSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL)
#define DO_VRSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL)
#define DO_VRSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL)
#define DO_RHADD_S(N, M) (((int64_t)(N) + (M) + 1) >> 1)
#define DO_RHADD_U(N, M) (((uint64_t)(N) + (M) + 1) >> 1)
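/*
 * Editor's example (standalone, not QEMU code): the halving add computes
 * its sum one bit wider than the inputs, so it can never overflow even
 * for maximal operands; a plain 32-bit addition would wrap here.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t vhadd_u_sketch(uint32_t n, uint32_t m)
{
    return ((uint64_t)n + m) >> 1; /* 33-bit sum, then halve */
}

int main(void)
{
    assert(vhadd_u_sketch(0xffffffffu, 0xffffffffu) == 0xffffffffu);
    assert(vhadd_u_sketch(3, 4) == 3); /* truncating; RHADD would round */
    return 0;
}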
static void do_vadc(CPUARMState *env, uint32_t *d, uint32_t *n, uint32_t *m,
        r += m[H4(e)] ^ inv;
    env->vfp.fpsr &= ~FPSR_NZCV_MASK;
    env->vfp.fpsr |= carry_in * FPSR_C;
    bool carry_in = env->vfp.fpsr & FPSR_C;
    bool carry_in = env->vfp.fpsr & FPSR_C;
    do_vadc(env, vd, vn, vm, -1, carry_in, false);
    do_vadc(env, vd, vn, vm, -1, 1, true);
    TYPE *d = vd, *n = vn, *m = vm; \
        r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)]); \
        r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)]); \
#define DO_SQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, INT8_MIN, INT8_MAX, s)
#define DO_SQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, INT16_MIN, INT16_MAX, s)
#define DO_SQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, INT32_MIN, INT32_MAX, s)
#define DO_UQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT8_MAX, s)
#define DO_UQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT16_MAX, s)
#define DO_UQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT32_MAX, s)
#define DO_SQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, INT8_MIN, INT8_MAX, s)
#define DO_SQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, INT16_MIN, INT16_MAX, s)
#define DO_SQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, INT32_MIN, INT32_MAX, s)
#define DO_UQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT8_MAX, s)
#define DO_UQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT16_MAX, s)
#define DO_UQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT32_MAX, s)
1207 * "shift by esize-1", adjusting the QRDMULH rounding constant to match.
1209 #define DO_QDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m) >> 7, \ argument
1211 #define DO_QDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m) >> 15, \ argument
1213 #define DO_QDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m) >> 31, \ argument
1216 #define DO_QRDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 6)) >> 7, \ argument
1218 #define DO_QRDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 14)) >> 15, \ argument
1220 #define DO_QRDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 30)) >> 31, \ argument
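/*
 * Editor's check (standalone, not QEMU code): exhaustive verification
 * for the byte case that "double, then shift by esize with rounding
 * constant 1 << (esize - 1)" equals the simplified "shift by esize - 1
 * with rounding constant 1 << (esize - 2)" used above. Assumes the
 * usual arithmetic right shift of negative values.
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
    for (int n = -128; n < 128; n++) {
        for (int m = -128; m < 128; m++) {
            int64_t p = (int64_t)n * m;
            assert(((2 * p + (1 << 7)) >> 8) == ((p + (1 << 6)) >> 7));
        }
    }
    return 0;
}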
#define WRAP_QRSHL_HELPER(FN, N, M, ROUND, satp) \
    typeof(N) qrshl_ret = FN(N, (int8_t)(M), sizeof(N) * 8, ROUND, &su32); \
#define DO_SQSHL_OP(N, M, satp) \
    WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, false, satp)
#define DO_UQSHL_OP(N, M, satp) \
    WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, false, satp)
#define DO_SQRSHL_OP(N, M, satp) \
    WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, true, satp)
#define DO_UQRSHL_OP(N, M, satp) \
    WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, true, satp)
#define DO_SUQSHL_OP(N, M, satp) \
    WRAP_QRSHL_HELPER(do_suqrshl_bhs, N, M, false, satp)
 * (A * B - C * D) etc for VQDMLSDH.
    TYPE *d = vd, *n = vn, *m = vm; \
            m[H##ESIZE(e - XCHG)], \
            n[H##ESIZE(e + (1 - 2 * XCHG))], \
            m[H##ESIZE(e + (1 - XCHG))], \
    env->vfp.qc[0] = qc; \
 * bring it back into the non-saturated range. However, if
    int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 7);
    int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 15);
    TYPE m = rm; \
        mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m), mask); \
    TYPE m = rm; \
        mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m, &sat), \
    env->vfp.qc[0] = qc; \
/* "accumulating" version where FN takes d as well as n and m */
    TYPE m = rm; \
            FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m), mask); \
    TYPE m = rm; \
            FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m, &sat), \
    env->vfp.qc[0] = qc; \
/* provide unsigned 2-op scalar helpers for all sizes */
 * bring it back into the non-saturated range. However, if
#define DO_VQDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 0, S)
#define DO_VQDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 0, S)
#define DO_VQDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 0, S)
#define DO_VQRDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 1, S)
#define DO_VQRDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 1, S)
#define DO_VQRDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 1, S)
#define DO_VQDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 0, S)
#define DO_VQDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 0, S)
#define DO_VQDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 0, S)
#define DO_VQRDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 1, S)
#define DO_VQRDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 1, S)
#define DO_VQRDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 1, S)
#define DO_VMLA(D, N, M) ((N) * (M) + (D))
#define DO_VMLAS(D, N, M) ((N) * (D) + (M))
 * whether to propagate a saturation indication into FPSCR.QC -- for
 * the 16x16->32 case we must check only the bit corresponding to the T or B
 * half that we used, but for the 32x32->64 case we propagate if the mask
 * covers either half of the 64-bit output element.
    TYPE m = rm; \
        LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], m, &sat); \
    env->vfp.qc[0] = qc; \
static inline int32_t do_qdmullh(int16_t n, int16_t m, bool *sat)
    int64_t r = ((int64_t)n * m) * 2;
static inline int64_t do_qdmullw(int32_t n, int32_t m, bool *sat)
    int64_t r = (int64_t)n * m;
    TYPE *n = vn, *m = vm; \
        LTYPE op2 = m[H##ESIZE(le * 2 + TOP)]; \
    env->vfp.qc[0] = qc; \
static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m)
    m &= 0xff;
    if (m == 0) {
    if (m < 8) {
        n >>= 8 - m;
static inline uint32_t do_vbrsrh(uint32_t n, uint32_t m)
    m &= 0xff;
    if (m == 0) {
    if (m < 16) {
        n >>= 16 - m;
static inline uint32_t do_vbrsrw(uint32_t n, uint32_t m)
    m &= 0xff;
    if (m == 0) {
    if (m < 32) {
        n >>= 32 - m;
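/*
 * Editor's sketch (standalone, not QEMU code): VBRSR produces the
 * bit-reversal of the bottom 'm' bits of each element with the rest
 * zeroed. The visible "n >>= 8 - m" above is the tail of that scheme: a
 * full-width bit reverse (elided from this excerpt) followed by a right
 * shift. revbit8_sketch stands in for QEMU's bit-reverse helper.
 */
#include <assert.h>
#include <stdint.h>

static uint8_t revbit8_sketch(uint8_t x)
{
    uint8_t r = 0;
    for (int i = 0; i < 8; i++) {
        r = (r << 1) | ((x >> i) & 1);
    }
    return r;
}

static uint32_t vbrsrb_sketch(uint32_t n, uint32_t m)
{
    m &= 0xff;
    if (m == 0) {
        return 0;
    }
    n = revbit8_sketch(n);
    if (m < 8) {
        n >>= 8 - m;
    }
    return n;
}

int main(void)
{
    /* reversing the bottom three bits of 110b yields 011b */
    assert(vbrsrb_sketch(0x06, 3) == 0x03);
    return 0;
}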
    TYPE *n = vn, *m = vm; \
        (int64_t)n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \
        (int64_t)n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \
DO_LDAV(vmlsldavsh, 2, int16_t, false, +=, -=)
DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=)
DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=)
DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=)
    TYPE *n = vn, *m = vm; \
        n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \
        n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \
DO_DAV_S(vmlsdav, false, +=, -=)
DO_DAV_S(vmlsdavx, true, +=, -=)
 * Rounding multiply add long dual accumulate high. In the pseudocode
 * this is implemented with a 72-bit internal accumulator value of which
 * the top 64 bits are returned. We optimize this to avoid having to
 * use 128-bit arithmetic -- we can do this because the 74-bit accumulator
 * is squashed back into 64-bits after each beat.
    TYPE *n = vn, *m = vm; \
            mul = (LTYPE)n[H4(e - 1 * XCHG)] * m[H4(e)]; \
            mul = -mul; \
            mul = (LTYPE)n[H4(e + 1 * XCHG)] * m[H4(e)]; \
    TYPE *m = vm; \
        ra += m[H##ESIZE(e)]; \
    TYPE *m = vm; \
        ra = FN(ra, m[H##ESIZE(e)]); \
 * note that we only take the absolute value of 'm', not 'n'
static int64_t do_maxa(int64_t n, int64_t m)
{
    if (m < 0) {
        m = -m;
    }
    return MAX(n, m);
}

static int64_t do_mina(int64_t n, int64_t m)
{
    if (m < 0) {
        m = -m;
    }
    return MIN(n, m);
}
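/*
 * Editor's usage note (assumes <assert.h>, plus QEMU's MAX/MIN macros
 * via the functions above; not from the original file): only the vector
 * operand 'm' is folded to its magnitude, so a negative running value
 * in 'n' competes against |m|.
 */
static void maxa_mina_example(void)
{
    assert(do_maxa(-3, -5) == 5);  /* MAX(-3, |-5|) */
    assert(do_mina(-3, -5) == -3); /* MIN(-3, |-5|) */
}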
    TYPE *m = vm, *n = vn; \
        int64_t m0 = m[H##ESIZE(e)]; \
        uint32_t r = n0 >= m0 ? (n0 - m0) : (m0 - n0); \
    TYPE *m = vm; \
        ra += (LTYPE)m[H4(e)]; \
    TYPE *d = vd, *m = vm; \
            FN(m[H##ESIZE(e)], shift), mask); \
    TYPE *d = vd, *m = vm; \
            FN(m[H##ESIZE(e)], shift, &sat), mask); \
    env->vfp.qc[0] = qc; \
/* provide unsigned 2-op shift helpers for all sizes */
/* Shift-and-insert; we always work with 64 bits at a time */
    uint64_t *d = vd, *m = vm; \
     * this because it would try to shift by an out-of-range \
        uint64_t r = (SHIFTFN(m[H8(e)], shift) & shiftmask) | \
#define SHL_MASK(EBITS, SHIFT) MAKE_64BIT_MASK((SHIFT), (EBITS) - (SHIFT))
#define SHR_MASK(EBITS, SHIFT) MAKE_64BIT_MASK(0, (EBITS) - (SHIFT))
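/*
 * Editor's worked example (standalone, not QEMU code): for a VSRI of
 * 16-bit elements by 4, SHR_MASK(16, 4) keeps the low 12 bits from the
 * shifted source and preserves the top 4 bits of the destination. The
 * MASK64 macro is a local stand-in for QEMU's MAKE_64BIT_MASK.
 */
#include <assert.h>
#include <stdint.h>

#define MASK64(SH, LEN) (((~0ull) >> (64 - (LEN))) << (SH))

int main(void)
{
    uint64_t shr_mask = MASK64(0, 16 - 4); /* SHR_MASK(16, 4) = 0xfff */
    uint16_t d = 0xabcd, m = 0x1234;
    uint16_t r = ((m >> 4) & shr_mask) | (d & ~shr_mask);
    assert(r == 0xa123); /* top nibble kept from d, rest shifted in */
    return 0;
}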
 * Long shifts taking half-sized inputs from top or bottom of the input
 * vector and producing a double-width result. ESIZE, TYPE are for
 * the input, and LESIZE, LTYPE for the output; negative shift counts
 * cannot occur, because the long shift is strictly left-only.
    TYPE *m = vm; \
        LTYPE r = (LTYPE)m[H##ESIZE(le * 2 + TOP)] << shift; \
    LTYPE *m = vm; \
        TYPE r = FN(m[H##LESIZE(le)], shift); \
    return (x >> sh) + ((x >> (sh - 1)) & 1);
    return (x >> sh) + ((x >> (sh - 1)) & 1);
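/*
 * Editor's example (standalone, not QEMU code): the rounding term
 * ((x >> (sh - 1)) & 1) adds one exactly when the last bit shifted out
 * was set, i.e. round-half-up. Valid for 0 < sh < 64, as in the
 * helpers above.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t urshr_sketch(uint64_t x, unsigned sh)
{
    return (x >> sh) + ((x >> (sh - 1)) & 1);
}

int main(void)
{
    assert(urshr_sketch(4, 1) == 2); /* 2.0 stays 2 */
    assert(urshr_sketch(5, 1) == 3); /* 2.5 rounds up to 3 */
    assert(urshr_sketch(6, 2) == 2); /* 1.5 rounds up to 2 */
    return 0;
}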
    LTYPE *m = vm; \
        TYPE r = FN(m[H##LESIZE(le)], shift, &sat); \
    env->vfp.qc[0] = qc; \
#define DO_SHRN_SB(N, M, SATP) \
    do_sat_bhs((int64_t)(N) >> (M), INT8_MIN, INT8_MAX, SATP)
#define DO_SHRN_UB(N, M, SATP) \
    do_sat_bhs((uint64_t)(N) >> (M), 0, UINT8_MAX, SATP)
#define DO_SHRUN_B(N, M, SATP) \
    do_sat_bhs((int64_t)(N) >> (M), 0, UINT8_MAX, SATP)
#define DO_SHRN_SH(N, M, SATP) \
    do_sat_bhs((int64_t)(N) >> (M), INT16_MIN, INT16_MAX, SATP)
#define DO_SHRN_UH(N, M, SATP) \
    do_sat_bhs((uint64_t)(N) >> (M), 0, UINT16_MAX, SATP)
#define DO_SHRUN_H(N, M, SATP) \
    do_sat_bhs((int64_t)(N) >> (M), 0, UINT16_MAX, SATP)
#define DO_RSHRN_SB(N, M, SATP) \
    do_sat_bhs(do_srshr(N, M), INT8_MIN, INT8_MAX, SATP)
#define DO_RSHRN_UB(N, M, SATP) \
    do_sat_bhs(do_urshr(N, M), 0, UINT8_MAX, SATP)
#define DO_RSHRUN_B(N, M, SATP) \
    do_sat_bhs(do_srshr(N, M), 0, UINT8_MAX, SATP)
#define DO_RSHRN_SH(N, M, SATP) \
    do_sat_bhs(do_srshr(N, M), INT16_MIN, INT16_MAX, SATP)
#define DO_RSHRN_UH(N, M, SATP) \
    do_sat_bhs(do_urshr(N, M), 0, UINT16_MAX, SATP)
#define DO_RSHRUN_H(N, M, SATP) \
    do_sat_bhs(do_srshr(N, M), 0, UINT16_MAX, SATP)
    LTYPE *m = vm; \
            m[H##LESIZE(le)], mask); \
    LTYPE *m = vm; \
        TYPE r = FN(m[H##LESIZE(le)], &sat); \
    env->vfp.qc[0] = qc; \
 * For each 32-bit element, we shift it left, bringing in the
        rdm = d[H4(e)] >> (32 - shift);
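/*
 * Editor's sketch of the VSHLC carry chain (hypothetical standalone
 * function, ignoring predication): each 32-bit element is shifted left,
 * its low bits are filled from rdm, and the bits shifted out of the top
 * become the rdm for the next element; the final rdm is returned to the
 * general-purpose register. Assumes 0 < shift < 32.
 */
#include <stdint.h>

static uint32_t vshlc_sketch(uint32_t d[4], uint32_t rdm, unsigned shift)
{
    for (unsigned e = 0; e < 4; e++) {
        uint32_t out = d[e] >> (32 - shift); /* bits shifted out on top */
        d[e] = (d[e] << shift) | (rdm & ((1u << shift) - 1));
        rdm = out;
    }
    return rdm;
}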
    return do_sqrshl_d(n, -(int8_t)shift, false, NULL);
    return do_sqrshl_d(n, (int8_t)shift, false, &env->QF);
    return do_uqrshl_d(n, (int8_t)shift, false, &env->QF);
    return do_sqrshl_d(n, -(int8_t)shift, true, &env->QF);
    return do_uqrshl_d(n, (int8_t)shift, true, &env->QF);
/* Operate on 64-bit values, but saturate at 48 bits */
    if (shift <= -48) {
        src >>= -shift - 1;
        val = src >> -shift;
/* Operate on 64-bit values, but saturate at 48 bits */
    if (shift <= -(48 + round)) {
        val = src >> (-shift - 1);
        val = src >> -shift;
    return do_sqrshl48_d(n, -(int8_t)shift, true, &env->QF);
    return do_uqrshl48_d(n, (int8_t)shift, true, &env->QF);
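/*
 * Editor's note (standalone check, not QEMU code): saturating at 48
 * bits while operating on 64-bit values means clamping to
 * [-2^47, 2^47 - 1] in the signed case.
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
    int64_t max48 = (1ll << 47) - 1;
    int64_t min48 = -(1ll << 47);
    int64_t v = 1ll << 50; /* well outside the 48-bit range */
    int64_t sat = v > max48 ? max48 : (v < min48 ? min48 : v);
    assert(sat == 0x7fffffffffffll);
    return 0;
}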
    return do_uqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
    return do_sqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
    return do_uqrshl_bhs(n, (int8_t)shift, 32, true, &env->QF);
    return do_sqrshl_bhs(n, -(int8_t)shift, 32, true, &env->QF);
    offset -= imm;
 * P0 bits for non-executed beats (where eci_mask is 0) are unchanged.
    TYPE *n = vn, *m = vm; \
        bool r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)]); \
    env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \
    env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \
#define DO_EQ(N, M) ((N) == (M))
#define DO_NE(N, M) ((N) != (M))
#define DO_GE(N, M) ((N) >= (M))
#define DO_LT(N, M) ((N) < (M))
#define DO_GT(N, M) ((N) > (M))
#define DO_LE(N, M) ((N) <= (M))
    uint64_t *d = vd, *n = vn, *m = vm;
    uint16_t p0 = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0);
        uint64_t r = m[H8(e)];
 * This insn is itself subject to predication and to beat-wise execution,
    uint16_t beatpred = ~env->v7m.vpr & mask;
    env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (beatpred & eci_mask);
 * ltpmask in mve_element_mask(), but we have pre-calculated
    env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (newmask & eci_mask);
    TYPE *d = vd, *m = vm; \
        mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)], &sat), mask); \
    env->vfp.qc[0] = qc; \
#define DO_VQNEG_B(N, SATP) do_sat_bhs(-(int64_t)N, INT8_MIN, INT8_MAX, SATP)
#define DO_VQNEG_H(N, SATP) do_sat_bhs(-(int64_t)N, INT16_MIN, INT16_MAX, SATP)
#define DO_VQNEG_W(N, SATP) do_sat_bhs(-(int64_t)N, INT32_MIN, INT32_MAX, SATP)
    STYPE *m = vm; \
        UTYPE r = DO_ABS(m[H##ESIZE(e)]); \
 * 2-operand floating point. Note that if an element is partially
 * predicated we must do the FP operation to update the non-predicated
 * bytes, but we must be careful to avoid updating the FP exception
 * state unless byte 0 of the element was unpredicated.
    TYPE *d = vd, *n = vn, *m = vm; \
        fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
        r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], fpst); \
    TYPE *d = vd, *n = vn, *m = vm; \
    fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
            r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)], fpst); \
            r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)], fpst); \
    TYPE *d = vd, *n = vn, *m = vm; \
        fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
            r = TYPE##_muladd(r, m[H##ESIZE(e)], d[H##ESIZE(e)], \
    TYPE *d = vd, *n = vn, *m = vm; \
    fpst0 = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
            e1 = m[H##ESIZE(e)]; \
            e3 = m[H##ESIZE(e + 1)]; \
            e1 = TYPE##_chs(m[H##ESIZE(e + 1)]); \
            e3 = m[H##ESIZE(e)]; \
            e1 = TYPE##_chs(m[H##ESIZE(e)]); \
            e3 = TYPE##_chs(m[H##ESIZE(e + 1)]); \
            e1 = m[H##ESIZE(e + 1)]; \
            e3 = TYPE##_chs(m[H##ESIZE(e)]); \
#define DO_VCMULH(N, M, D, S) float16_mul(N, M, S)
#define DO_VCMULS(N, M, D, S) float32_mul(N, M, S)
#define DO_VCMLAH(N, M, D, S) float16_muladd(N, M, D, 0, S)
#define DO_VCMLAS(N, M, D, S) float32_muladd(N, M, D, 0, S)
    TYPE r, m = rm; \
        fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
        r = FN(n[H##ESIZE(e)], m, fpst); \
    TYPE r, m = rm; \
        fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
        r = FN(n[H##ESIZE(e)], m, d[H##ESIZE(e)], 0, fpst); \
#define DO_VFMAS_SCALARH(N, M, D, F, S) float16_muladd(N, D, M, F, S)
#define DO_VFMAS_SCALARS(N, M, D, F, S) float32_muladd(N, D, M, F, S)
    TYPE *m = vm; \
        &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
        TYPE v = m[H##ESIZE(e)]; \
    TYPE *n = vn, *m = vm; \
        fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
        r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], fpst); \
    env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \
        fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
    env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \
    TYPE *d = vd, *m = vm; \
        fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
        r = FN(m[H##ESIZE(e)], shift, fpst); \
    TYPE *d = vd, *m = vm; \
        &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
        r = FN(m[H##ESIZE(e)], 0, fpst); \
#define DO_VRINT_RM_H(M, F, S) helper_rinth(M, S)
#define DO_VRINT_RM_S(M, F, S) helper_rints(M, S)
    uint32_t *m = vm;
    bool ieee = !(env->vfp.fpcr & FPCR_AHP);
    float_status *base_fpst = &env->vfp.fp_status[FPST_STD];
            r = float32_to_float16(m[H4(e)], ieee, fpst);
    uint16_t *m = vm;
    bool ieee = !(env->vfp.fpcr & FPCR_AHP);
    float_status *base_fpst = &env->vfp.fp_status[FPST_STD];
            r = float16_to_float32(m[H2(e * 2 + top)], ieee, fpst);
    TYPE *d = vd, *m = vm; \
        fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
        r = FN(m[H##ESIZE(e)], fpst); \