arm/tcg/mve_helper.c

2  * M-profile MVE Operations
24 #include "exec/helper-proto.h"
25 #include "accel/tcg/cpu-ldst.h"
39     if ((env->condexec_bits & 0xf) != 0) {  in mve_eci_mask()
43     eci = env->condexec_bits >> 4;  in mve_eci_mask()
66      *  (3) low-overhead-branch tail predication will mask out part  in mve_element_mask()
70      * We combine all these into a 16-bit result with the same semantics  in mve_element_mask()
72      * 8-bit vector ops will look at all bits of the result;  in mve_element_mask()
73      * 16-bit ops will look at bits 0, 2, 4, ...;  in mve_element_mask()
74      * 32-bit ops will look at bits 0, 4, 8 and 12.  in mve_element_mask()
76      * the 4-bit slice of the mask corresponding to a single beat.  in mve_element_mask()
78     uint16_t mask = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0);  in mve_element_mask()
80     if (!(env->v7m.vpr & R_V7M_VPR_MASK01_MASK)) {  in mve_element_mask()
83     if (!(env->v7m.vpr & R_V7M_VPR_MASK23_MASK)) {  in mve_element_mask()
87     if (env->v7m.ltpsize < 4 &&  in mve_element_mask()
88         env->regs[14] <= (1 << (4 - env->v7m.ltpsize))) {  in mve_element_mask()
95         int masklen = env->regs[14] << env->v7m.ltpsize;  in mve_element_mask()
112     uint32_t vpr = env->v7m.vpr;  in mve_advance_vpt()
117     if ((env->condexec_bits & 0xf) == 0) {  in mve_advance_vpt()
118         env->condexec_bits = (env->condexec_bits == (ECI_A0A1A2B0 << 4)) ?  in mve_advance_vpt()
147     env->v7m.vpr = vpr;  in mve_advance_vpt()
268  * 64-bit accesses are slightly different: they are done as two 32-bit
270  * and with a single 32-bit offset in the first of the two Qm elements.
273  * stored in the even-beat element.
293                 m[H4(e & ~1)] = addr - 4;                               \
319                 m[H4(e & ~1)] = addr - 4;                               \
380  * one 32-bit memory access per beat.  in DO_VLDR64_SG()
559             for (e = 3; e >= 0; e--) {                                  \
645             for (e = 3; e >= 0; e--) {                                  \
670             for (e = 1; e >= 0; e--) {                                  \
775      * into the 32-bit value, so we only need to write the 32-bit  in HELPER()
799 #define DO_CLS_B(N)   (clrsb32(N) - 24)
800 #define DO_CLS_H(N)   (clrsb32(N) - 16)
806 #define DO_CLZ_B(N)   (clz32(N) - 24)
807 #define DO_CLZ_H(N)   (clz32(N) - 16)
824 #define DO_ABS(N) ((N) < 0 ? -(N) : (N))
836 #define DO_NEG(N)    (-(N))
850  * All these insns work at 64-bit widths.
886 /* provide unsigned 2-op helpers for all sizes */
892 /* provide signed 2-op helpers for all sizes */
899  * "Long" operations where two half-sized inputs (taken from either the
900  * top or the bottom of the input vector) produce a double-width result.
932             env->vfp.qc[0] = qc;                                        \
937 /* provide unsigned 2-op helpers for all sizes */
943 /* provide signed 2-op helpers for all sizes */
962 #define DO_SUB(N, M) ((N) - (M))
1048 #define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))  in DO_2OP_S()
1065     return ((uint64_t)n - m) >> 1;  in do_vhsub_u()
1070     return ((int64_t)n - m) >> 1;  in do_vhsub_s()
1117         env->vfp.fpsr &= ~FPSR_NZCV_MASK;  in DO_2OP_S()
1118         env->vfp.fpsr |= carry_in * FPSR_C;  in DO_2OP_S()
1125     bool carry_in = env->vfp.fpsr & FPSR_C;  in HELPER()
1131     bool carry_in = env->vfp.fpsr & FPSR_C;  in HELPER()
1132     do_vadc(env, vd, vn, vm, -1, carry_in, false);  in HELPER()
1143     do_vadc(env, vd, vn, vm, -1, 1, true);  in HELPER()
1158                 r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)]);         \
1197 #define DO_SQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, INT8_MIN, INT8_MAX, s)
1198 #define DO_SQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, INT16_MIN, INT16_MAX, s)
1199 #define DO_SQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, INT32_MIN, INT32_MAX, s)
1201 #define DO_UQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT8_MAX, s)
1202 #define DO_UQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT16_MAX, s)
1203 #define DO_UQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT32_MAX, s)
1207  * "shift by esize-1", adjusting the QRDMULH rounding constant to match.
1281  * (A * B - C * D) etc for VQDMLSDH.  in DO_2OP_SAT_S()
1295                             m[H##ESIZE(e - XCHG)],                      \  in DO_2OP_SAT_S()
1296                             n[H##ESIZE(e + (1 - 2 * XCHG))],            \  in DO_2OP_SAT_S()
1297                             m[H##ESIZE(e + (1 - XCHG))],                \  in DO_2OP_SAT_S()
1304             env->vfp.qc[0] = qc;                                        \  in DO_2OP_SAT_S()
1334      * bring it back into the non-saturated range. However, if  in do_vqdmladh_w()
1353     int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 7);  in do_vqdmlsdh_b()
1360     int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 15);  in do_vqdmlsdh_h()
1438             env->vfp.qc[0] = qc;                                        \
1476             env->vfp.qc[0] = qc;                                        \
1481 /* provide unsigned 2-op scalar helpers for all sizes */
1546      * bring it back into the non-saturated range. However, if  in do_vqdmlah_w()
1612  * whether to propagate a saturation indication into FPSCR.QC -- for  in DO_2OP_ACC_SCALAR_U()
1613  * the 16x16->32 case we must check only the bit corresponding to the T or B  in DO_2OP_ACC_SCALAR_U()
1614  * half that we used, but for the 32x32->64 case we propagate if the mask  in DO_2OP_ACC_SCALAR_U()
1634             env->vfp.qc[0] = qc;                                        \  in DO_2OP_ACC_SCALAR_U()
1693             env->vfp.qc[0] = qc;                                        \
1711         n >>= 8 - m;  in do_vbrsrb()
1724         n >>= 16 - m;  in do_vbrsrh()
1737         n >>= 32 - m;  in do_vbrsrw()
1760                         (int64_t)n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \
1779 DO_LDAV(vmlsldavsh, 2, int16_t, false, +=, -=)
1780 DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=)
1781 DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=)
1782 DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=)
1798                         n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)];     \
1821 DO_DAV_S(vmlsdav, false, +=, -=)
1823 DO_DAV_S(vmlsdavx, true, +=, -=)
1827  * this is implemented with a 72-bit internal accumulator value of which
1829  * use 128-bit arithmetic -- we can do this because the 74-bit accumulator
1830  * is squashed back into 64-bits after each beat.
1843                     mul = (LTYPE)n[H4(e - 1 * XCHG)] * m[H4(e)];        \
1845                         mul = -mul;                                     \
1928         m = -m;
1936         m = -m;  in do_mina()
1967                 uint32_t r = n0 >= m0 ? (n0 - m0) : (m0 - n0);  \  in DO_VMAXMINV_S()
2031             env->vfp.qc[0] = qc;                                \
2036 /* provide unsigned 2-op shift helpers for all sizes */
2065 /* Shift-and-insert; we always work with 64 bits at a time */
2078              * this because it would try to shift by an out-of-range    \
2098 #define SHL_MASK(EBITS, SHIFT) MAKE_64BIT_MASK((SHIFT), (EBITS) - (SHIFT))
2099 #define SHR_MASK(EBITS, SHIFT) MAKE_64BIT_MASK(0, (EBITS) - (SHIFT))
2109  * Long shifts taking half-sized inputs from top or bottom of the input
2110  * vector and producing a double-width result. ESIZE, TYPE are for
2113  * because the long shift is strictly left-only.
2170         return (x >> sh) + ((x >> (sh - 1)) & 1);
2181         return (x >> sh) + ((x >> (sh - 1)) & 1);  in do_srshr()
2223             env->vfp.qc[0] = qc;                                \
2322             env->vfp.qc[0] = qc;                                        \
2373      * For each 32-bit element, we shift it left, bringing in the
2393                 rdm = d[H4(e)] >> (32 - shift);
2404     return do_sqrshl_d(n, -(int8_t)shift, false, NULL);  in HELPER()
2414     return do_sqrshl_d(n, (int8_t)shift, false, &env->QF);  in HELPER()
2419     return do_uqrshl_d(n, (int8_t)shift, false, &env->QF);  in HELPER()
2424     return do_sqrshl_d(n, -(int8_t)shift, true, &env->QF);  in HELPER()
2429     return do_uqrshl_d(n, (int8_t)shift, true, &env->QF);  in HELPER()
2432 /* Operate on 64-bit values, but saturate at 48 bits */
2438     if (shift <= -48) {  in do_sqrshl48_d()
2446             src >>= -shift - 1;  in do_sqrshl48_d()
2449             val = src >> -shift;  in do_sqrshl48_d()
2468 /* Operate on 64-bit values, but saturate at 48 bits */
2474     if (shift <= -(48 + round)) {  in do_uqrshl48_d()
2478             val = src >> (-shift - 1);  in do_uqrshl48_d()
2481             val = src >> -shift;  in do_uqrshl48_d()
2502     return do_sqrshl48_d(n, -(int8_t)shift, true, &env->QF);  in HELPER()
2507     return do_uqrshl48_d(n, (int8_t)shift, true, &env->QF);  in HELPER()
2512     return do_uqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);  in HELPER()
2517     return do_sqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);  in HELPER()
2522     return do_uqrshl_bhs(n, (int8_t)shift, 32, true, &env->QF);  in HELPER()
2527     return do_sqrshl_bhs(n, -(int8_t)shift, 32, true, &env->QF);  in HELPER()
2585     offset -= imm;  in do_sub_wrap()
2595  * P0 bits for non-executed beats (where eci_mask is 0) are unchanged.  in DO_VIDUP_ALL()
2616         env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) |           \  in DO_VIDUP_ALL()
2638         env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) |           \
2686     uint16_t p0 = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0);
2703      * This insn is itself subject to predication and to beat-wise execution,  in HELPER()
2708     uint16_t beatpred = ~env->v7m.vpr & mask;  in HELPER()
2709     env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (beatpred & eci_mask);  in HELPER()
2717  * ltpmask in mve_element_mask(), but we have pre-calculated
2729     env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (newmask & eci_mask);  in HELPER()
2746             env->vfp.qc[0] = qc;                                        \
2758 #define DO_VQNEG_B(N, SATP) do_sat_bhs(-(int64_t)N, INT8_MIN, INT8_MAX, SATP)
2759 #define DO_VQNEG_H(N, SATP) do_sat_bhs(-(int64_t)N, INT16_MIN, INT16_MAX, SATP)
2760 #define DO_VQNEG_W(N, SATP) do_sat_bhs(-(int64_t)N, INT32_MIN, INT32_MAX, SATP)
2797  * 2-operand floating point. Note that if an element is partially
2798  * predicated we must do the FP operation to update the non-predicated
2816             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
2889             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \  in DO_2OP_FP_ALL()
2898                 r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)], fpst);   \  in DO_2OP_FP_ALL()
2912 #define DO_VFMA(OP, ESIZE, TYPE, CHS)                                   \  argument
2926             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
2933             if (CHS) {                                                  \
2963             fpst0 = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
3047             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
3081             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
3113             &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
3163             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
3174         env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) |           \
3196             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
3207         env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) |           \
3260             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
3293             &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD];  \
3334     bool ieee = !(env->vfp.fpcr & FPCR_AHP);
3338     float_status *base_fpst = &env->vfp.fp_status[FPST_STD];
3364     bool ieee = !(env->vfp.fpcr & FPCR_AHP);  in do_vcvt_hs()
3368     float_status *base_fpst = &env->vfp.fp_status[FPST_STD];  in do_vcvt_hs()
3418             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \