xref: /qemu/target/arm/tcg/mve_helper.c (revision 13ab3764ea0c0e43e2258e1755c08551c8eb8e60)
1 /*
2  * M-profile MVE Operations
3  *
4  * Copyright (c) 2021 Linaro, Ltd.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "vec_internal.h"
24 #include "exec/helper-proto.h"
25 #include "accel/tcg/cpu-ldst.h"
26 #include "tcg/tcg.h"
27 #include "fpu/softfloat.h"
28 #include "crypto/clmul.h"
29 
30 static uint16_t mve_eci_mask(CPUARMState *env)
31 {
32     /*
33      * Return the mask of which elements in the MVE vector correspond
34      * to beats being executed: it has 1 bits for lanes which are still
35      * to be executed and 0 bits where ECI says the beat was already executed.
36      */
37     int eci;
38 
39     if ((env->condexec_bits & 0xf) != 0) {
40         return 0xffff;
41     }
42 
43     eci = env->condexec_bits >> 4;
44     switch (eci) {
45     case ECI_NONE:
46         return 0xffff;
47     case ECI_A0:
48         return 0xfff0;
49     case ECI_A0A1:
50         return 0xff00;
51     case ECI_A0A1A2:
52     case ECI_A0A1A2B0:
53         return 0xf000;
54     default:
55         g_assert_not_reached();
56     }
57 }
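
/*
 * Worked example: if EPSR.ECI is ECI_A0A1 then beats 0 and 1 of this insn
 * have already been executed, so the mask returned here is 0xff00. A 32-bit
 * operation, which looks at bits 0, 4, 8 and 12 of the mask, will therefore
 * process only elements 2 and 3 of the vector.
 */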
58 
59 static uint16_t mve_element_mask(CPUARMState *env)
60 {
61     /*
62      * Return the mask of which elements in the MVE vector should be
63      * updated. This is a combination of multiple things:
64      *  (1) by default, we update every lane in the vector
65      *  (2) VPT predication stores its state in the VPR register;
66      *  (3) low-overhead-branch tail predication will mask out part of
67      *      the vector on the final iteration of the loop
68      *  (4) if EPSR.ECI is set then we must execute only some beats
69      *      of the insn
70      * We combine all these into a 16-bit result with the same semantics
71      * as VPR.P0: 0 to mask the lane, 1 if it is active.
72      * 8-bit vector ops will look at all bits of the result;
73      * 16-bit ops will look at bits 0, 2, 4, ...;
74      * 32-bit ops will look at bits 0, 4, 8 and 12.
75      * Compare pseudocode GetCurInstrBeat(), though that only returns
76      * the 4-bit slice of the mask corresponding to a single beat.
77      */
78     uint16_t mask = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0);
79 
80     if (!(env->v7m.vpr & R_V7M_VPR_MASK01_MASK)) {
81         mask |= 0xff;
82     }
83     if (!(env->v7m.vpr & R_V7M_VPR_MASK23_MASK)) {
84         mask |= 0xff00;
85     }
86 
87     if (env->v7m.ltpsize < 4 &&
88         env->regs[14] <= (1 << (4 - env->v7m.ltpsize))) {
89         /*
90          * Tail predication active, and this is the last loop iteration.
91          * The element size is (1 << ltpsize), and we only want to process
92          * loopcount elements, so we want to retain the least significant
93          * (loopcount * esize) predicate bits and zero out bits above that.
94          */
95         int masklen = env->regs[14] << env->v7m.ltpsize;
96         assert(masklen <= 16);
97         uint16_t ltpmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0;
98         mask &= ltpmask;
99     }
100 
101     /*
102      * ECI bits indicate which beats are already executed;
103      * we handle this by effectively predicating them out.
104      */
105     mask &= mve_eci_mask(env);
106     return mask;
107 }
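
/*
 * Worked example of the tail-predication case above: with 32-bit elements
 * (ltpsize == 2) and LR == 3 on the final LETP iteration, the test
 * regs[14] <= (1 << (4 - ltpsize)) holds (3 <= 4), masklen is 3 << 2 == 12
 * and ltpmask is 0x0fff. A 32-bit op then sees predicate bits 0, 4 and 8
 * set but bit 12 clear, so it processes elements 0..2 and leaves element 3
 * predicated out (for loads this means element 3 is written as zero).
 */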
108 
109 static void mve_advance_vpt(CPUARMState *env)
110 {
111     /* Advance the VPT and ECI state if necessary */
112     uint32_t vpr = env->v7m.vpr;
113     unsigned mask01, mask23;
114     uint16_t inv_mask;
115     uint16_t eci_mask = mve_eci_mask(env);
116 
117     if ((env->condexec_bits & 0xf) == 0) {
118         env->condexec_bits = (env->condexec_bits == (ECI_A0A1A2B0 << 4)) ?
119             (ECI_A0 << 4) : (ECI_NONE << 4);
120     }
121 
122     if (!(vpr & (R_V7M_VPR_MASK01_MASK | R_V7M_VPR_MASK23_MASK))) {
123         /* VPT not enabled, nothing to do */
124         return;
125     }
126 
127     /* Invert P0 bits if needed, but only for beats we actually executed */
128     mask01 = FIELD_EX32(vpr, V7M_VPR, MASK01);
129     mask23 = FIELD_EX32(vpr, V7M_VPR, MASK23);
130     /* Start by assuming we invert all bits corresponding to executed beats */
131     inv_mask = eci_mask;
132     if (mask01 <= 8) {
133         /* MASK01 says don't invert low half of P0 */
134         inv_mask &= ~0xff;
135     }
136     if (mask23 <= 8) {
137         /* MASK23 says don't invert high half of P0 */
138         inv_mask &= ~0xff00;
139     }
140     vpr ^= inv_mask;
141     /* Only update MASK01 if beat 1 executed */
142     if (eci_mask & 0xf0) {
143         vpr = FIELD_DP32(vpr, V7M_VPR, MASK01, mask01 << 1);
144     }
145     /* Beat 3 always executes, so update MASK23 */
146     vpr = FIELD_DP32(vpr, V7M_VPR, MASK23, mask23 << 1);
147     env->v7m.vpr = vpr;
148 }
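
/*
 * Worked example of the mask advance: if MASK01 is 0b1100 it is > 8, so the
 * low half of P0 is inverted (for the beats which actually executed) and
 * MASK01 advances to 0b1000. On the next insn in the VPT block 0b1000 <= 8,
 * so P0 is left unchanged and MASK01 advances to 0b0000, at which point the
 * low half of the vector is no longer predicated.
 */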
149 
150 /* For loads, predicated lanes are zeroed instead of keeping their old values */
151 #define DO_VLDR(OP, MFLAG, MSIZE, MTYPE, LDTYPE, ESIZE, TYPE)           \
152     void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr)    \
153     {                                                                   \
154         TYPE *d = vd;                                                   \
155         uint16_t mask = mve_element_mask(env);                          \
156         uint16_t eci_mask = mve_eci_mask(env);                          \
157         unsigned b, e;                                                  \
158         int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env));            \
159         MemOpIdx oi = make_memop_idx(MFLAG | MO_ALIGN, mmu_idx);        \
160         /*                                                              \
161          * R_SXTM allows the dest reg to become UNKNOWN for abandoned   \
162          * beats so we don't care if we update part of the dest and     \
163          * then take an exception.                                      \
164          */                                                             \
165         for (b = 0, e = 0; b < 16; b += ESIZE, e++) {                   \
166             if (eci_mask & (1 << b)) {                                  \
167                 d[H##ESIZE(e)] = (mask & (1 << b)) ?                    \
168                     (MTYPE)cpu_##LDTYPE##_mmu(env, addr, oi, GETPC()) : 0;\
169             }                                                           \
170             addr += MSIZE;                                              \
171         }                                                               \
172         mve_advance_vpt(env);                                           \
173     }
174 
175 #define DO_VSTR(OP, MFLAG, MSIZE, STTYPE, ESIZE, TYPE)                  \
176     void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr)    \
177     {                                                                   \
178         TYPE *d = vd;                                                   \
179         uint16_t mask = mve_element_mask(env);                          \
180         unsigned b, e;                                                  \
181         int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env));            \
182         MemOpIdx oi = make_memop_idx(MFLAG | MO_ALIGN, mmu_idx);        \
183         for (b = 0, e = 0; b < 16; b += ESIZE, e++) {                   \
184             if (mask & (1 << b)) {                                      \
185                 cpu_##STTYPE##_mmu(env, addr, d[H##ESIZE(e)], oi, GETPC()); \
186             }                                                           \
187             addr += MSIZE;                                              \
188         }                                                               \
189         mve_advance_vpt(env);                                           \
190     }
191 
192 DO_VLDR(vldrb, MO_UB, 1, uint8_t, ldb, 1, uint8_t)
193 DO_VLDR(vldrh, MO_TEUW, 2, uint16_t, ldw, 2, uint16_t)
194 DO_VLDR(vldrw, MO_TEUL, 4, uint32_t, ldl, 4, uint32_t)
195 
196 DO_VSTR(vstrb, MO_UB, 1, stb, 1, uint8_t)
197 DO_VSTR(vstrh, MO_TEUW, 2, stw, 2, uint16_t)
198 DO_VSTR(vstrw, MO_TEUL, 4, stl, 4, uint32_t)
199 
200 DO_VLDR(vldrb_sh, MO_SB, 1, int8_t, ldb, 2, int16_t)
201 DO_VLDR(vldrb_sw, MO_SB, 1, int8_t, ldb, 4, int32_t)
202 DO_VLDR(vldrb_uh, MO_UB, 1, uint8_t, ldb, 2, uint16_t)
203 DO_VLDR(vldrb_uw, MO_UB, 1, uint8_t, ldb, 4, uint32_t)
204 DO_VLDR(vldrh_sw, MO_TESW, 2, int16_t, ldw, 4, int32_t)
205 DO_VLDR(vldrh_uw, MO_TEUW, 2, uint16_t, ldw, 4, uint32_t)
206 
207 DO_VSTR(vstrb_h, MO_UB, 1, stb, 2, int16_t)
208 DO_VSTR(vstrb_w, MO_UB, 1, stb, 4, int32_t)
209 DO_VSTR(vstrh_w, MO_TEUW, 2, stw, 4, int32_t)
210 
211 #undef DO_VLDR
212 #undef DO_VSTR
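
/*
 * As a concrete illustration of the two macros above, the widening load
 * DO_VLDR(vldrb_sh, MO_SB, 1, int8_t, ldb, 2, int16_t) expands to roughly
 * the following helper, which loads one signed byte from consecutive byte
 * addresses into each 16-bit destination element:
 *
 *   void HELPER(mve_vldrb_sh)(CPUARMState *env, void *vd, uint32_t addr)
 *   {
 *       int16_t *d = vd;
 *       uint16_t mask = mve_element_mask(env);
 *       uint16_t eci_mask = mve_eci_mask(env);
 *       unsigned b, e;
 *       int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env));
 *       MemOpIdx oi = make_memop_idx(MO_SB | MO_ALIGN, mmu_idx);
 *       for (b = 0, e = 0; b < 16; b += 2, e++) {
 *           if (eci_mask & (1 << b)) {
 *               d[H2(e)] = (mask & (1 << b)) ?
 *                   (int8_t)cpu_ldb_mmu(env, addr, oi, GETPC()) : 0;
 *           }
 *           addr += 1;
 *       }
 *       mve_advance_vpt(env);
 *   }
 */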
213 
214 /*
215  * Gather loads/scatter stores. Here each element of Qm specifies
216  * an offset to use from the base register Rn. In the _os_ versions
217  * that offset is scaled by the element size.
218  * For loads, predicated lanes are zeroed instead of retaining
219  * their previous values.
220  */
221 #define DO_VLDR_SG(OP, MFLAG, MTYPE, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN, WB)\
222     void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm,         \
223                           uint32_t base)                                \
224     {                                                                   \
225         TYPE *d = vd;                                                   \
226         OFFTYPE *m = vm;                                                \
227         uint16_t mask = mve_element_mask(env);                          \
228         uint16_t eci_mask = mve_eci_mask(env);                          \
229         unsigned e;                                                     \
230         uint32_t addr;                                                  \
231         int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env));            \
232         MemOpIdx oi = make_memop_idx(MFLAG | MO_ALIGN, mmu_idx);        \
233         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \
234             if (!(eci_mask & 1)) {                                      \
235                 continue;                                               \
236             }                                                           \
237             addr = ADDRFN(base, m[H##ESIZE(e)]);                        \
238             d[H##ESIZE(e)] = (mask & 1) ?                               \
239                 (MTYPE)cpu_##LDTYPE##_mmu(env, addr, oi, GETPC()) : 0;  \
240             if (WB) {                                                   \
241                 m[H##ESIZE(e)] = addr;                                  \
242             }                                                           \
243         }                                                               \
244         mve_advance_vpt(env);                                           \
245     }
246 
247 /* Here TYPE is always unsigned, so it is the same as the offset type */
248 #define DO_VSTR_SG(OP, MFLAG, STTYPE, ESIZE, TYPE, ADDRFN, WB)          \
249     void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm,         \
250                           uint32_t base)                                \
251     {                                                                   \
252         TYPE *d = vd;                                                   \
253         TYPE *m = vm;                                                   \
254         uint16_t mask = mve_element_mask(env);                          \
255         uint16_t eci_mask = mve_eci_mask(env);                          \
256         unsigned e;                                                     \
257         uint32_t addr;                                                  \
258         int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env));            \
259         MemOpIdx oi = make_memop_idx(MFLAG | MO_ALIGN, mmu_idx);        \
260         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \
261             if (!(eci_mask & 1)) {                                      \
262                 continue;                                               \
263             }                                                           \
264             addr = ADDRFN(base, m[H##ESIZE(e)]);                        \
265             if (mask & 1) {                                             \
266                 cpu_##STTYPE##_mmu(env, addr, d[H##ESIZE(e)], oi, GETPC()); \
267             }                                                           \
268             if (WB) {                                                   \
269                 m[H##ESIZE(e)] = addr;                                  \
270             }                                                           \
271         }                                                               \
272         mve_advance_vpt(env);                                           \
273     }
274 
275 /*
276  * 64-bit accesses are slightly different: they are done as two 32-bit
277  * accesses, controlled by the predicate mask for the relevant beat,
278  * and with a single 32-bit offset in the first of the two Qm elements.
279  * Note that for QEMU our IMPDEF AIRCR.ENDIANNESS is always 0 (little).
280  * Address writeback happens on the odd beats and updates the address
281  * stored in the even-beat element.
282  */
283 #define DO_VLDR64_SG(OP, ADDRFN, WB)                                    \
284     void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm,         \
285                           uint32_t base)                                \
286     {                                                                   \
287         uint32_t *d = vd;                                               \
288         uint32_t *m = vm;                                               \
289         uint16_t mask = mve_element_mask(env);                          \
290         uint16_t eci_mask = mve_eci_mask(env);                          \
291         unsigned e;                                                     \
292         uint32_t addr;                                                  \
293         int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env));            \
294         MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx);      \
295         for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) {      \
296             if (!(eci_mask & 1)) {                                      \
297                 continue;                                               \
298             }                                                           \
299             addr = ADDRFN(base, m[H4(e & ~1)]);                         \
300             addr += 4 * (e & 1);                                        \
301             d[H4(e)] = (mask & 1) ? cpu_ldl_mmu(env, addr, oi, GETPC()) : 0; \
302             if (WB && (e & 1)) {                                        \
303                 m[H4(e & ~1)] = addr - 4;                               \
304             }                                                           \
305         }                                                               \
306         mve_advance_vpt(env);                                           \
307     }
308 
309 #define DO_VSTR64_SG(OP, ADDRFN, WB)                                    \
310     void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm,         \
311                           uint32_t base)                                \
312     {                                                                   \
313         uint32_t *d = vd;                                               \
314         uint32_t *m = vm;                                               \
315         uint16_t mask = mve_element_mask(env);                          \
316         uint16_t eci_mask = mve_eci_mask(env);                          \
317         unsigned e;                                                     \
318         uint32_t addr;                                                  \
319         int mmu_idx = arm_to_core_mmu_idx(arm_mmu_idx(env));            \
320         MemOpIdx oi = make_memop_idx(MO_TEUL | MO_ALIGN, mmu_idx);      \
321         for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) {      \
322             if (!(eci_mask & 1)) {                                      \
323                 continue;                                               \
324             }                                                           \
325             addr = ADDRFN(base, m[H4(e & ~1)]);                         \
326             addr += 4 * (e & 1);                                        \
327             if (mask & 1) {                                             \
328                 cpu_stl_mmu(env, addr, d[H4(e)], oi, GETPC());          \
329             }                                                           \
330             if (WB && (e & 1)) {                                        \
331                 m[H4(e & ~1)] = addr - 4;                               \
332             }                                                           \
333         }                                                               \
334         mve_advance_vpt(env);                                           \
335     }
336 
337 #define ADDR_ADD(BASE, OFFSET) ((BASE) + (OFFSET))
338 #define ADDR_ADD_OSH(BASE, OFFSET) ((BASE) + ((OFFSET) << 1))
339 #define ADDR_ADD_OSW(BASE, OFFSET) ((BASE) + ((OFFSET) << 2))
340 #define ADDR_ADD_OSD(BASE, OFFSET) ((BASE) + ((OFFSET) << 3))
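
/*
 * For example, with ADDR_ADD_OSH a 16-bit offset vector of { 0, 1, 2, 3 }
 * and base B gives the addresses B, B + 2, B + 4 and B + 6: the "_os_"
 * (offset-scaled) forms shift each offset left by log2 of the element size.
 */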
341 
342 DO_VLDR_SG(vldrb_sg_sh, MO_SB, int8_t, ldb, 2, int16_t, uint16_t, ADDR_ADD, false)
343 DO_VLDR_SG(vldrb_sg_sw, MO_SB, int8_t, ldb, 4, int32_t, uint32_t, ADDR_ADD, false)
344 DO_VLDR_SG(vldrh_sg_sw, MO_TESW, int16_t, ldw, 4, int32_t, uint32_t, ADDR_ADD, false)
345 
346 DO_VLDR_SG(vldrb_sg_ub, MO_UB, uint8_t, ldb, 1, uint8_t, uint8_t, ADDR_ADD, false)
347 DO_VLDR_SG(vldrb_sg_uh, MO_UB, uint8_t, ldb, 2, uint16_t, uint16_t, ADDR_ADD, false)
348 DO_VLDR_SG(vldrb_sg_uw, MO_UB, uint8_t, ldb, 4, uint32_t, uint32_t, ADDR_ADD, false)
349 DO_VLDR_SG(vldrh_sg_uh, MO_TEUW, uint16_t, ldw, 2, uint16_t, uint16_t, ADDR_ADD, false)
350 DO_VLDR_SG(vldrh_sg_uw, MO_TEUW, uint16_t, ldw, 4, uint32_t, uint32_t, ADDR_ADD, false)
351 DO_VLDR_SG(vldrw_sg_uw, MO_TEUL, uint32_t, ldl, 4, uint32_t, uint32_t, ADDR_ADD, false)
352 DO_VLDR64_SG(vldrd_sg_ud, ADDR_ADD, false)
353 
354 DO_VLDR_SG(vldrh_sg_os_sw, MO_TESW, int16_t, ldw, 4,
355            int32_t, uint32_t, ADDR_ADD_OSH, false)
356 DO_VLDR_SG(vldrh_sg_os_uh, MO_TEUW, uint16_t, ldw, 2,
357            uint16_t, uint16_t, ADDR_ADD_OSH, false)
358 DO_VLDR_SG(vldrh_sg_os_uw, MO_TEUW, uint16_t, ldw, 4,
359            uint32_t, uint32_t, ADDR_ADD_OSH, false)
360 DO_VLDR_SG(vldrw_sg_os_uw, MO_TEUL, uint32_t, ldl, 4,
361            uint32_t, uint32_t, ADDR_ADD_OSW, false)
362 DO_VLDR64_SG(vldrd_sg_os_ud, ADDR_ADD_OSD, false)
363 
364 DO_VSTR_SG(vstrb_sg_ub, MO_UB, stb, 1, uint8_t, ADDR_ADD, false)
365 DO_VSTR_SG(vstrb_sg_uh, MO_UB, stb, 2, uint16_t, ADDR_ADD, false)
366 DO_VSTR_SG(vstrb_sg_uw, MO_UB, stb, 4, uint32_t, ADDR_ADD, false)
367 DO_VSTR_SG(vstrh_sg_uh, MO_TEUW, stw, 2, uint16_t, ADDR_ADD, false)
368 DO_VSTR_SG(vstrh_sg_uw, MO_TEUW, stw, 4, uint32_t, ADDR_ADD, false)
369 DO_VSTR_SG(vstrw_sg_uw, MO_TEUL, stl, 4, uint32_t, ADDR_ADD, false)
370 DO_VSTR64_SG(vstrd_sg_ud, ADDR_ADD, false)
371 
372 DO_VSTR_SG(vstrh_sg_os_uh, MO_TEUW, stw, 2, uint16_t, ADDR_ADD_OSH, false)
373 DO_VSTR_SG(vstrh_sg_os_uw, MO_TEUW, stw, 4, uint32_t, ADDR_ADD_OSH, false)
374 DO_VSTR_SG(vstrw_sg_os_uw, MO_TEUL, stl, 4, uint32_t, ADDR_ADD_OSW, false)
375 DO_VSTR64_SG(vstrd_sg_os_ud, ADDR_ADD_OSD, false)
376 
377 DO_VLDR_SG(vldrw_sg_wb_uw, MO_TEUL, uint32_t, ldl, 4, uint32_t, uint32_t, ADDR_ADD, true)
378 DO_VLDR64_SG(vldrd_sg_wb_ud, ADDR_ADD, true)
379 DO_VSTR_SG(vstrw_sg_wb_uw, MO_TEUL, stl, 4, uint32_t, ADDR_ADD, true)
380 DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true)
381 
382 /*
383  * Deinterleaving loads/interleaving stores.
384  *
385  * For these helpers we are passed the index of the first Qreg
386  * (VLD2/VST2 will also access Qn+1, VLD4/VST4 access Qn .. Qn+3)
387  * and the value of the base address register Rn.
388  * The helpers are specialized for pattern and element size, so
389  * for instance vld42h is VLD4 with pattern 2, element size MO_16.
390  *
391  * These insns are beatwise but not predicated, so we must honour ECI,
392  * but need not look at mve_element_mask().
393  *
394  * The pseudocode implements these insns with multiple memory accesses
395  * of the element size, but rules R_VVVG and R_FXDM permit us to make
396  * one 32-bit memory access per beat.
397  */
398 #define DO_VLD4B(OP, O1, O2, O3, O4)                                    \
399     void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx,             \
400                           uint32_t base)                                \
401     {                                                                   \
402         int beat, e;                                                    \
403         uint16_t mask = mve_eci_mask(env);                              \
404         static const uint8_t off[4] = { O1, O2, O3, O4 };               \
405         uint32_t addr, data;                                            \
406         for (beat = 0; beat < 4; beat++, mask >>= 4) {                  \
407             if ((mask & 1) == 0) {                                      \
408                 /* ECI says skip this beat */                           \
409                 continue;                                               \
410             }                                                           \
411             addr = base + off[beat] * 4;                                \
412             data = cpu_ldl_le_data_ra(env, addr, GETPC());              \
413             for (e = 0; e < 4; e++, data >>= 8) {                       \
414                 uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \
415                 qd[H1(off[beat])] = data;                               \
416             }                                                           \
417         }                                                               \
418     }
419 
420 #define DO_VLD4H(OP, O1, O2)                                            \
421     void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx,             \
422                           uint32_t base)                                \
423     {                                                                   \
424         int beat;                                                       \
425         uint16_t mask = mve_eci_mask(env);                              \
426         static const uint8_t off[4] = { O1, O1, O2, O2 };               \
427         uint32_t addr, data;                                            \
428         int y; /* y counts 0 2 0 2 */                                   \
429         uint16_t *qd;                                                   \
430         for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) {   \
431             if ((mask & 1) == 0) {                                      \
432                 /* ECI says skip this beat */                           \
433                 continue;                                               \
434             }                                                           \
435             addr = base + off[beat] * 8 + (beat & 1) * 4;               \
436             data = cpu_ldl_le_data_ra(env, addr, GETPC());              \
437             qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y);             \
438             qd[H2(off[beat])] = data;                                   \
439             data >>= 16;                                                \
440             qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1);         \
441             qd[H2(off[beat])] = data;                                   \
442         }                                                               \
443     }
444 
445 #define DO_VLD4W(OP, O1, O2, O3, O4)                                    \
446     void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx,             \
447                           uint32_t base)                                \
448     {                                                                   \
449         int beat;                                                       \
450         uint16_t mask = mve_eci_mask(env);                              \
451         static const uint8_t off[4] = { O1, O2, O3, O4 };               \
452         uint32_t addr, data;                                            \
453         uint32_t *qd;                                                   \
454         int y;                                                          \
455         for (beat = 0; beat < 4; beat++, mask >>= 4) {                  \
456             if ((mask & 1) == 0) {                                      \
457                 /* ECI says skip this beat */                           \
458                 continue;                                               \
459             }                                                           \
460             addr = base + off[beat] * 4;                                \
461             data = cpu_ldl_le_data_ra(env, addr, GETPC());              \
462             y = (beat + (O1 & 2)) & 3;                                  \
463             qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y);             \
464             qd[H4(off[beat] >> 2)] = data;                              \
465         }                                                               \
466     }
467 
468 DO_VLD4B(vld40b, 0, 1, 10, 11)
469 DO_VLD4B(vld41b, 2, 3, 12, 13)
470 DO_VLD4B(vld42b, 4, 5, 14, 15)
471 DO_VLD4B(vld43b, 6, 7, 8, 9)
472 
473 DO_VLD4H(vld40h, 0, 5)
474 DO_VLD4H(vld41h, 1, 6)
475 DO_VLD4H(vld42h, 2, 7)
476 DO_VLD4H(vld43h, 3, 4)
477 
478 DO_VLD4W(vld40w, 0, 1, 10, 11)
479 DO_VLD4W(vld41w, 2, 3, 12, 13)
480 DO_VLD4W(vld42w, 4, 5, 14, 15)
481 DO_VLD4W(vld43w, 6, 7, 8, 9)
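
/*
 * Worked example of the offset tables: VLD4 with byte elements views memory
 * as an array of 4-byte structures, element i of Qd..Qd+3 coming from the
 * structure at base + 4 * i. So vld40b (pattern 0, off[] = { 0, 1, 10, 11 })
 * loads the 32-bit words at base + 0, + 4, + 40 and + 44 and writes elements
 * 0, 1, 10 and 11 of each of the four destination registers; the other three
 * patterns between them cover the remaining elements.
 */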
482 
483 #define DO_VLD2B(OP, O1, O2, O3, O4)                                    \
484     void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx,             \
485                           uint32_t base)                                \
486     {                                                                   \
487         int beat, e;                                                    \
488         uint16_t mask = mve_eci_mask(env);                              \
489         static const uint8_t off[4] = { O1, O2, O3, O4 };               \
490         uint32_t addr, data;                                            \
491         uint8_t *qd;                                                    \
492         for (beat = 0; beat < 4; beat++, mask >>= 4) {                  \
493             if ((mask & 1) == 0) {                                      \
494                 /* ECI says skip this beat */                           \
495                 continue;                                               \
496             }                                                           \
497             addr = base + off[beat] * 2;                                \
498             data = cpu_ldl_le_data_ra(env, addr, GETPC());              \
499             for (e = 0; e < 4; e++, data >>= 8) {                       \
500                 qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1));    \
501                 qd[H1(off[beat] + (e >> 1))] = data;                    \
502             }                                                           \
503         }                                                               \
504     }
505 
506 #define DO_VLD2H(OP, O1, O2, O3, O4)                                    \
507     void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx,             \
508                           uint32_t base)                                \
509     {                                                                   \
510         int beat;                                                       \
511         uint16_t mask = mve_eci_mask(env);                              \
512         static const uint8_t off[4] = { O1, O2, O3, O4 };               \
513         uint32_t addr, data;                                            \
514         int e;                                                          \
515         uint16_t *qd;                                                   \
516         for (beat = 0; beat < 4; beat++, mask >>= 4) {                  \
517             if ((mask & 1) == 0) {                                      \
518                 /* ECI says skip this beat */                           \
519                 continue;                                               \
520             }                                                           \
521             addr = base + off[beat] * 4;                                \
522             data = cpu_ldl_le_data_ra(env, addr, GETPC());              \
523             for (e = 0; e < 2; e++, data >>= 16) {                      \
524                 qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e);         \
525                 qd[H2(off[beat])] = data;                               \
526             }                                                           \
527         }                                                               \
528     }
529 
530 #define DO_VLD2W(OP, O1, O2, O3, O4)                                    \
531     void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx,             \
532                           uint32_t base)                                \
533     {                                                                   \
534         int beat;                                                       \
535         uint16_t mask = mve_eci_mask(env);                              \
536         static const uint8_t off[4] = { O1, O2, O3, O4 };               \
537         uint32_t addr, data;                                            \
538         uint32_t *qd;                                                   \
539         for (beat = 0; beat < 4; beat++, mask >>= 4) {                  \
540             if ((mask & 1) == 0) {                                      \
541                 /* ECI says skip this beat */                           \
542                 continue;                                               \
543             }                                                           \
544             addr = base + off[beat];                                    \
545             data = cpu_ldl_le_data_ra(env, addr, GETPC());              \
546             qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1));    \
547             qd[H4(off[beat] >> 3)] = data;                              \
548         }                                                               \
549     }
550 
551 DO_VLD2B(vld20b, 0, 2, 12, 14)
552 DO_VLD2B(vld21b, 4, 6, 8, 10)
553 
554 DO_VLD2H(vld20h, 0, 1, 6, 7)
555 DO_VLD2H(vld21h, 2, 3, 4, 5)
556 
557 DO_VLD2W(vld20w, 0, 4, 24, 28)
558 DO_VLD2W(vld21w, 8, 12, 16, 20)
559 
560 #define DO_VST4B(OP, O1, O2, O3, O4)                                    \
561     void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx,             \
562                           uint32_t base)                                \
563     {                                                                   \
564         int beat, e;                                                    \
565         uint16_t mask = mve_eci_mask(env);                              \
566         static const uint8_t off[4] = { O1, O2, O3, O4 };               \
567         uint32_t addr, data;                                            \
568         for (beat = 0; beat < 4; beat++, mask >>= 4) {                  \
569             if ((mask & 1) == 0) {                                      \
570                 /* ECI says skip this beat */                           \
571                 continue;                                               \
572             }                                                           \
573             addr = base + off[beat] * 4;                                \
574             data = 0;                                                   \
575             for (e = 3; e >= 0; e--) {                                  \
576                 uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \
577                 data = (data << 8) | qd[H1(off[beat])];                 \
578             }                                                           \
579             cpu_stl_le_data_ra(env, addr, data, GETPC());               \
580         }                                                               \
581     }
582 
583 #define DO_VST4H(OP, O1, O2)                                            \
584     void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx,             \
585                           uint32_t base)                                \
586     {                                                                   \
587         int beat;                                                       \
588         uint16_t mask = mve_eci_mask(env);                              \
589         static const uint8_t off[4] = { O1, O1, O2, O2 };               \
590         uint32_t addr, data;                                            \
591         int y; /* y counts 0 2 0 2 */                                   \
592         uint16_t *qd;                                                   \
593         for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) {   \
594             if ((mask & 1) == 0) {                                      \
595                 /* ECI says skip this beat */                           \
596                 continue;                                               \
597             }                                                           \
598             addr = base + off[beat] * 8 + (beat & 1) * 4;               \
599             qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y);             \
600             data = qd[H2(off[beat])];                                   \
601             qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1);         \
602             data |= qd[H2(off[beat])] << 16;                            \
603             cpu_stl_le_data_ra(env, addr, data, GETPC());               \
604         }                                                               \
605     }
606 
607 #define DO_VST4W(OP, O1, O2, O3, O4)                                    \
608     void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx,             \
609                           uint32_t base)                                \
610     {                                                                   \
611         int beat;                                                       \
612         uint16_t mask = mve_eci_mask(env);                              \
613         static const uint8_t off[4] = { O1, O2, O3, O4 };               \
614         uint32_t addr, data;                                            \
615         uint32_t *qd;                                                   \
616         int y;                                                          \
617         for (beat = 0; beat < 4; beat++, mask >>= 4) {                  \
618             if ((mask & 1) == 0) {                                      \
619                 /* ECI says skip this beat */                           \
620                 continue;                                               \
621             }                                                           \
622             addr = base + off[beat] * 4;                                \
623             y = (beat + (O1 & 2)) & 3;                                  \
624             qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y);             \
625             data = qd[H4(off[beat] >> 2)];                              \
626             cpu_stl_le_data_ra(env, addr, data, GETPC());               \
627         }                                                               \
628     }
629 
630 DO_VST4B(vst40b, 0, 1, 10, 11)
631 DO_VST4B(vst41b, 2, 3, 12, 13)
632 DO_VST4B(vst42b, 4, 5, 14, 15)
633 DO_VST4B(vst43b, 6, 7, 8, 9)
634 
635 DO_VST4H(vst40h, 0, 5)
636 DO_VST4H(vst41h, 1, 6)
637 DO_VST4H(vst42h, 2, 7)
638 DO_VST4H(vst43h, 3, 4)
639 
640 DO_VST4W(vst40w, 0, 1, 10, 11)
641 DO_VST4W(vst41w, 2, 3, 12, 13)
642 DO_VST4W(vst42w, 4, 5, 14, 15)
643 DO_VST4W(vst43w, 6, 7, 8, 9)
644 
645 #define DO_VST2B(OP, O1, O2, O3, O4)                                    \
646     void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx,             \
647                           uint32_t base)                                \
648     {                                                                   \
649         int beat, e;                                                    \
650         uint16_t mask = mve_eci_mask(env);                              \
651         static const uint8_t off[4] = { O1, O2, O3, O4 };               \
652         uint32_t addr, data;                                            \
653         uint8_t *qd;                                                    \
654         for (beat = 0; beat < 4; beat++, mask >>= 4) {                  \
655             if ((mask & 1) == 0) {                                      \
656                 /* ECI says skip this beat */                           \
657                 continue;                                               \
658             }                                                           \
659             addr = base + off[beat] * 2;                                \
660             data = 0;                                                   \
661             for (e = 3; e >= 0; e--) {                                  \
662                 qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1));    \
663                 data = (data << 8) | qd[H1(off[beat] + (e >> 1))];      \
664             }                                                           \
665             cpu_stl_le_data_ra(env, addr, data, GETPC());               \
666         }                                                               \
667     }
668 
669 #define DO_VST2H(OP, O1, O2, O3, O4)                                    \
670     void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx,             \
671                           uint32_t base)                                \
672     {                                                                   \
673         int beat;                                                       \
674         uint16_t mask = mve_eci_mask(env);                              \
675         static const uint8_t off[4] = { O1, O2, O3, O4 };               \
676         uint32_t addr, data;                                            \
677         int e;                                                          \
678         uint16_t *qd;                                                   \
679         for (beat = 0; beat < 4; beat++, mask >>= 4) {                  \
680             if ((mask & 1) == 0) {                                      \
681                 /* ECI says skip this beat */                           \
682                 continue;                                               \
683             }                                                           \
684             addr = base + off[beat] * 4;                                \
685             data = 0;                                                   \
686             for (e = 1; e >= 0; e--) {                                  \
687                 qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e);         \
688                 data = (data << 16) | qd[H2(off[beat])];                \
689             }                                                           \
690             cpu_stl_le_data_ra(env, addr, data, GETPC());               \
691         }                                                               \
692     }
693 
694 #define DO_VST2W(OP, O1, O2, O3, O4)                                    \
695     void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx,             \
696                           uint32_t base)                                \
697     {                                                                   \
698         int beat;                                                       \
699         uint16_t mask = mve_eci_mask(env);                              \
700         static const uint8_t off[4] = { O1, O2, O3, O4 };               \
701         uint32_t addr, data;                                            \
702         uint32_t *qd;                                                   \
703         for (beat = 0; beat < 4; beat++, mask >>= 4) {                  \
704             if ((mask & 1) == 0) {                                      \
705                 /* ECI says skip this beat */                           \
706                 continue;                                               \
707             }                                                           \
708             addr = base + off[beat];                                    \
709             qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1));    \
710             data = qd[H4(off[beat] >> 3)];                              \
711             cpu_stl_le_data_ra(env, addr, data, GETPC());               \
712         }                                                               \
713     }
714 
715 DO_VST2B(vst20b, 0, 2, 12, 14)
716 DO_VST2B(vst21b, 4, 6, 8, 10)
717 
718 DO_VST2H(vst20h, 0, 1, 6, 7)
719 DO_VST2H(vst21h, 2, 3, 4, 5)
720 
721 DO_VST2W(vst20w, 0, 4, 24, 28)
722 DO_VST2W(vst21w, 8, 12, 16, 20)
723 
724 /*
725  * The mergemask(D, R, M) macro performs the operation "*D = R" but
726  * storing only the bytes which correspond to 1 bits in M,
727  * leaving other bytes in *D unchanged. We use _Generic
728  * to select the correct implementation based on the type of D.
729  */
730 
731 static void mergemask_ub(uint8_t *d, uint8_t r, uint16_t mask)
732 {
733     if (mask & 1) {
734         *d = r;
735     }
736 }
737 
738 static void mergemask_sb(int8_t *d, int8_t r, uint16_t mask)
739 {
740     mergemask_ub((uint8_t *)d, r, mask);
741 }
742 
743 static void mergemask_uh(uint16_t *d, uint16_t r, uint16_t mask)
744 {
745     uint16_t bmask = expand_pred_b(mask);
746     *d = (*d & ~bmask) | (r & bmask);
747 }
748 
749 static void mergemask_sh(int16_t *d, int16_t r, uint16_t mask)
750 {
751     mergemask_uh((uint16_t *)d, r, mask);
752 }
753 
754 static void mergemask_uw(uint32_t *d, uint32_t r, uint16_t mask)
755 {
756     uint32_t bmask = expand_pred_b(mask);
757     *d = (*d & ~bmask) | (r & bmask);
758 }
759 
760 static void mergemask_sw(int32_t *d, int32_t r, uint16_t mask)
761 {
762     mergemask_uw((uint32_t *)d, r, mask);
763 }
764 
765 static void mergemask_uq(uint64_t *d, uint64_t r, uint16_t mask)
766 {
767     uint64_t bmask = expand_pred_b(mask);
768     *d = (*d & ~bmask) | (r & bmask);
769 }
770 
771 static void mergemask_sq(int64_t *d, int64_t r, uint16_t mask)
772 {
773     mergemask_uq((uint64_t *)d, r, mask);
774 }
775 
776 #define mergemask(D, R, M)                      \
777     _Generic(D,                                 \
778              uint8_t *: mergemask_ub,           \
779              int8_t *:  mergemask_sb,           \
780              uint16_t *: mergemask_uh,          \
781              int16_t *:  mergemask_sh,          \
782              uint32_t *: mergemask_uw,          \
783              int32_t *:  mergemask_sw,          \
784              uint64_t *: mergemask_uq,          \
785              int64_t *:  mergemask_sq)(D, R, M)
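
/*
 * The merge is done with byte granularity, matching the MVE predicate, so
 * a 16-bit or wider element may be only partially updated. For example:
 *
 *   uint16_t d = 0x1234;
 *   mergemask(&d, 0xabcd, 0x0002);    => d == 0xab34
 *
 * because the expanded byte mask for predicate bits 0b10 is 0xff00.
 */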
786 
787 void HELPER(mve_vdup)(CPUARMState *env, void *vd, uint32_t val)
788 {
789     /*
790      * The generated code already replicated an 8 or 16 bit constant
791      * into the 32-bit value, so we only need to write the 32-bit
792      * value to all elements of the Qreg, allowing for predication.
793      */
794     uint32_t *d = vd;
795     uint16_t mask = mve_element_mask(env);
796     unsigned e;
797     for (e = 0; e < 16 / 4; e++, mask >>= 4) {
798         mergemask(&d[H4(e)], val, mask);
799     }
800     mve_advance_vpt(env);
801 }
802 
803 #define DO_1OP(OP, ESIZE, TYPE, FN)                                     \
804     void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm)         \
805     {                                                                   \
806         TYPE *d = vd, *m = vm;                                          \
807         uint16_t mask = mve_element_mask(env);                          \
808         unsigned e;                                                     \
809         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
810             mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)]), mask);       \
811         }                                                               \
812         mve_advance_vpt(env);                                           \
813     }
814 
815 #define DO_CLS_B(N)   (clrsb32(N) - 24)
816 #define DO_CLS_H(N)   (clrsb32(N) - 16)
817 
818 DO_1OP(vclsb, 1, int8_t, DO_CLS_B)
819 DO_1OP(vclsh, 2, int16_t, DO_CLS_H)
820 DO_1OP(vclsw, 4, int32_t, clrsb32)
821 
822 #define DO_CLZ_B(N)   (clz32(N) - 24)
823 #define DO_CLZ_H(N)   (clz32(N) - 16)
824 
825 DO_1OP(vclzb, 1, uint8_t, DO_CLZ_B)
826 DO_1OP(vclzh, 2, uint16_t, DO_CLZ_H)
827 DO_1OP(vclzw, 4, uint32_t, clz32)
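
/*
 * The "- 24" and "- 16" adjustments account for the value having been
 * widened to 32 bits before counting: e.g. DO_CLZ_B(0x10) is
 * clz32(0x10) - 24 == 27 - 24 == 3, the number of leading zeroes in the
 * 8-bit value 0x10.
 */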
828 
829 DO_1OP(vrev16b, 2, uint16_t, bswap16)
830 DO_1OP(vrev32b, 4, uint32_t, bswap32)
831 DO_1OP(vrev32h, 4, uint32_t, hswap32)
832 DO_1OP(vrev64b, 8, uint64_t, bswap64)
833 DO_1OP(vrev64h, 8, uint64_t, hswap64)
834 DO_1OP(vrev64w, 8, uint64_t, wswap64)
835 
836 #define DO_NOT(N) (~(N))
837 
838 DO_1OP(vmvn, 8, uint64_t, DO_NOT)
839 
840 #define DO_ABS(N) ((N) < 0 ? -(N) : (N))
841 #define DO_FABSH(N)  ((N) & dup_const(MO_16, 0x7fff))
842 #define DO_FABSS(N)  ((N) & dup_const(MO_32, 0x7fffffff))
843 
844 DO_1OP(vabsb, 1, int8_t, DO_ABS)
845 DO_1OP(vabsh, 2, int16_t, DO_ABS)
846 DO_1OP(vabsw, 4, int32_t, DO_ABS)
847 
848 /* We can do these 64 bits at a time */
849 DO_1OP(vfabsh, 8, uint64_t, DO_FABSH)
850 DO_1OP(vfabss, 8, uint64_t, DO_FABSS)
851 
852 #define DO_NEG(N)    (-(N))
853 #define DO_FNEGH(N) ((N) ^ dup_const(MO_16, 0x8000))
854 #define DO_FNEGS(N) ((N) ^ dup_const(MO_32, 0x80000000))
855 
856 DO_1OP(vnegb, 1, int8_t, DO_NEG)
857 DO_1OP(vnegh, 2, int16_t, DO_NEG)
858 DO_1OP(vnegw, 4, int32_t, DO_NEG)
859 
860 /* We can do these 64 bits at a time */
861 DO_1OP(vfnegh, 8, uint64_t, DO_FNEGH)
862 DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS)
863 
864 /*
865  * 1 operand immediates: Vda is destination and possibly also one source.
866  * All these insns work at 64-bit widths.
867  */
868 #define DO_1OP_IMM(OP, FN)                                              \
869     void HELPER(mve_##OP)(CPUARMState *env, void *vda, uint64_t imm)    \
870     {                                                                   \
871         uint64_t *da = vda;                                             \
872         uint16_t mask = mve_element_mask(env);                          \
873         unsigned e;                                                     \
874         for (e = 0; e < 16 / 8; e++, mask >>= 8) {                      \
875             mergemask(&da[H8(e)], FN(da[H8(e)], imm), mask);            \
876         }                                                               \
877         mve_advance_vpt(env);                                           \
878     }
879 
880 #define DO_MOVI(N, I) (I)
881 #define DO_ANDI(N, I) ((N) & (I))
882 #define DO_ORRI(N, I) ((N) | (I))
883 
884 DO_1OP_IMM(vmovi, DO_MOVI)
885 DO_1OP_IMM(vandi, DO_ANDI)
886 DO_1OP_IMM(vorri, DO_ORRI)
887 
888 #define DO_2OP(OP, ESIZE, TYPE, FN)                                     \
889     void HELPER(glue(mve_, OP))(CPUARMState *env,                       \
890                                 void *vd, void *vn, void *vm)           \
891     {                                                                   \
892         TYPE *d = vd, *n = vn, *m = vm;                                 \
893         uint16_t mask = mve_element_mask(env);                          \
894         unsigned e;                                                     \
895         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
896             mergemask(&d[H##ESIZE(e)],                                  \
897                       FN(n[H##ESIZE(e)], m[H##ESIZE(e)]), mask);        \
898         }                                                               \
899         mve_advance_vpt(env);                                           \
900     }
901 
902 /* provide unsigned 2-op helpers for all sizes */
903 #define DO_2OP_U(OP, FN)                        \
904     DO_2OP(OP##b, 1, uint8_t, FN)               \
905     DO_2OP(OP##h, 2, uint16_t, FN)              \
906     DO_2OP(OP##w, 4, uint32_t, FN)
907 
908 /* provide signed 2-op helpers for all sizes */
909 #define DO_2OP_S(OP, FN)                        \
910     DO_2OP(OP##b, 1, int8_t, FN)                \
911     DO_2OP(OP##h, 2, int16_t, FN)               \
912     DO_2OP(OP##w, 4, int32_t, FN)
913 
914 /*
915  * "Long" operations where two half-sized inputs (taken from either the
916  * top or the bottom of the input vector) produce a double-width result.
917  * Here ESIZE, TYPE are for the input, and LESIZE, LTYPE for the output.
918  */
919 #define DO_2OP_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN)               \
920     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
921     {                                                                   \
922         LTYPE *d = vd;                                                  \
923         TYPE *n = vn, *m = vm;                                          \
924         uint16_t mask = mve_element_mask(env);                          \
925         unsigned le;                                                    \
926         for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) {         \
927             LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)],              \
928                          m[H##ESIZE(le * 2 + TOP)]);                    \
929             mergemask(&d[H##LESIZE(le)], r, mask);                      \
930         }                                                               \
931         mve_advance_vpt(env);                                           \
932     }
933 
934 #define DO_2OP_SAT(OP, ESIZE, TYPE, FN)                                 \
935     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
936     {                                                                   \
937         TYPE *d = vd, *n = vn, *m = vm;                                 \
938         uint16_t mask = mve_element_mask(env);                          \
939         unsigned e;                                                     \
940         bool qc = false;                                                \
941         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
942             bool sat = false;                                           \
943             TYPE r_ = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], &sat);         \
944             mergemask(&d[H##ESIZE(e)], r_, mask);                       \
945             qc |= sat & mask & 1;                                       \
946         }                                                               \
947         if (qc) {                                                       \
948             env->vfp.qc[0] = qc;                                        \
949         }                                                               \
950         mve_advance_vpt(env);                                           \
951     }
952 
953 /* provide unsigned 2-op helpers for all sizes */
954 #define DO_2OP_SAT_U(OP, FN)                    \
955     DO_2OP_SAT(OP##b, 1, uint8_t, FN)           \
956     DO_2OP_SAT(OP##h, 2, uint16_t, FN)          \
957     DO_2OP_SAT(OP##w, 4, uint32_t, FN)
958 
959 /* provide signed 2-op helpers for all sizes */
960 #define DO_2OP_SAT_S(OP, FN)                    \
961     DO_2OP_SAT(OP##b, 1, int8_t, FN)            \
962     DO_2OP_SAT(OP##h, 2, int16_t, FN)           \
963     DO_2OP_SAT(OP##w, 4, int32_t, FN)
964 
965 #define DO_AND(N, M)  ((N) & (M))
966 #define DO_BIC(N, M)  ((N) & ~(M))
967 #define DO_ORR(N, M)  ((N) | (M))
968 #define DO_ORN(N, M)  ((N) | ~(M))
969 #define DO_EOR(N, M)  ((N) ^ (M))
970 
971 DO_2OP(vand, 8, uint64_t, DO_AND)
972 DO_2OP(vbic, 8, uint64_t, DO_BIC)
973 DO_2OP(vorr, 8, uint64_t, DO_ORR)
974 DO_2OP(vorn, 8, uint64_t, DO_ORN)
975 DO_2OP(veor, 8, uint64_t, DO_EOR)
976 
977 #define DO_ADD(N, M) ((N) + (M))
978 #define DO_SUB(N, M) ((N) - (M))
979 #define DO_MUL(N, M) ((N) * (M))
980 
981 DO_2OP_U(vadd, DO_ADD)
982 DO_2OP_U(vsub, DO_SUB)
983 DO_2OP_U(vmul, DO_MUL)
984 
985 DO_2OP_L(vmullbsb, 0, 1, int8_t, 2, int16_t, DO_MUL)
986 DO_2OP_L(vmullbsh, 0, 2, int16_t, 4, int32_t, DO_MUL)
987 DO_2OP_L(vmullbsw, 0, 4, int32_t, 8, int64_t, DO_MUL)
988 DO_2OP_L(vmullbub, 0, 1, uint8_t, 2, uint16_t, DO_MUL)
989 DO_2OP_L(vmullbuh, 0, 2, uint16_t, 4, uint32_t, DO_MUL)
990 DO_2OP_L(vmullbuw, 0, 4, uint32_t, 8, uint64_t, DO_MUL)
991 
992 DO_2OP_L(vmulltsb, 1, 1, int8_t, 2, int16_t, DO_MUL)
993 DO_2OP_L(vmulltsh, 1, 2, int16_t, 4, int32_t, DO_MUL)
994 DO_2OP_L(vmulltsw, 1, 4, int32_t, 8, int64_t, DO_MUL)
995 DO_2OP_L(vmulltub, 1, 1, uint8_t, 2, uint16_t, DO_MUL)
996 DO_2OP_L(vmulltuh, 1, 2, uint16_t, 4, uint32_t, DO_MUL)
997 DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL)
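
/*
 * For example, vmullbuh ("bottom") multiplies the even-numbered 16-bit
 * elements of Qn and Qm (elements 0, 2, 4 and 6) to produce four 32-bit
 * results, while vmulltuh ("top") uses the odd-numbered elements 1, 3, 5, 7.
 */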
998 
999 /*
1000  * Polynomial multiply. We can always do this generating 64 bits
1001  * of the result at a time, so we don't need to use DO_2OP_L.
1002  */
1003 DO_2OP(vmullpbh, 8, uint64_t, clmul_8x4_even)
1004 DO_2OP(vmullpth, 8, uint64_t, clmul_8x4_odd)
1005 DO_2OP(vmullpbw, 8, uint64_t, clmul_16x2_even)
1006 DO_2OP(vmullptw, 8, uint64_t, clmul_16x2_odd)
1007 
1008 /*
1009  * Because the computation type is at least twice as large as required,
1010  * these work for both signed and unsigned source types.
1011  */
1012 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
1013 {
1014     return (n * m) >> 8;
1015 }
1016 
1017 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
1018 {
1019     return (n * m) >> 16;
1020 }
1021 
1022 static inline uint32_t do_mulh_w(int64_t n, int64_t m)
1023 {
1024     return (n * m) >> 32;
1025 }
1026 
1027 static inline uint8_t do_rmulh_b(int32_t n, int32_t m)
1028 {
1029     return (n * m + (1U << 7)) >> 8;
1030 }
1031 
1032 static inline uint16_t do_rmulh_h(int32_t n, int32_t m)
1033 {
1034     return (n * m + (1U << 15)) >> 16;
1035 }
1036 
1037 static inline uint32_t do_rmulh_w(int64_t n, int64_t m)
1038 {
1039     return (n * m + (1U << 31)) >> 32;
1040 }
1041 
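/*
 * For example, in the unsigned byte case do_mulh_b(255, 255) computes
 * 65025 >> 8 == 254, the high byte of the 16-bit product 0xfe01; in the
 * signed case the arguments arrive already sign-extended to int32_t, so
 * the same shift yields the top byte of the signed product.
 */
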
1042 DO_2OP(vmulhsb, 1, int8_t, do_mulh_b)
1043 DO_2OP(vmulhsh, 2, int16_t, do_mulh_h)
1044 DO_2OP(vmulhsw, 4, int32_t, do_mulh_w)
1045 DO_2OP(vmulhub, 1, uint8_t, do_mulh_b)
1046 DO_2OP(vmulhuh, 2, uint16_t, do_mulh_h)
1047 DO_2OP(vmulhuw, 4, uint32_t, do_mulh_w)
1048 
1049 DO_2OP(vrmulhsb, 1, int8_t, do_rmulh_b)
1050 DO_2OP(vrmulhsh, 2, int16_t, do_rmulh_h)
1051 DO_2OP(vrmulhsw, 4, int32_t, do_rmulh_w)
1052 DO_2OP(vrmulhub, 1, uint8_t, do_rmulh_b)
1053 DO_2OP(vrmulhuh, 2, uint16_t, do_rmulh_h)
1054 DO_2OP(vrmulhuw, 4, uint32_t, do_rmulh_w)
1055 
1056 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
1057 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
1058 
1059 DO_2OP_S(vmaxs, DO_MAX)
1060 DO_2OP_U(vmaxu, DO_MAX)
1061 DO_2OP_S(vmins, DO_MIN)
1062 DO_2OP_U(vminu, DO_MIN)
1063 
1064 #define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
1065 
1066 DO_2OP_S(vabds, DO_ABD)
1067 DO_2OP_U(vabdu, DO_ABD)
1068 
1069 static inline uint32_t do_vhadd_u(uint32_t n, uint32_t m)
1070 {
1071     return ((uint64_t)n + m) >> 1;
1072 }
1073 
1074 static inline int32_t do_vhadd_s(int32_t n, int32_t m)
1075 {
1076     return ((int64_t)n + m) >> 1;
1077 }
1078 
1079 static inline uint32_t do_vhsub_u(uint32_t n, uint32_t m)
1080 {
1081     return ((uint64_t)n - m) >> 1;
1082 }
1083 
1084 static inline int32_t do_vhsub_s(int32_t n, int32_t m)
1085 {
1086     return ((int64_t)n - m) >> 1;
1087 }
1088 
1089 DO_2OP_S(vhadds, do_vhadd_s)
1090 DO_2OP_U(vhaddu, do_vhadd_u)
1091 DO_2OP_S(vhsubs, do_vhsub_s)
1092 DO_2OP_U(vhsubu, do_vhsub_u)
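/*
 * Editorial note (worked example): widening to 64 bits before halving
 * matters because the intermediate sum or difference must not wrap.
 * For the 32-bit lanes:
 *   do_vhadd_u(0xffffffff, 0xffffffff) == 0x1fffffffe >> 1 == 0xffffffff
 * whereas a 32-bit (n + m) >> 1 would have wrapped to 0x7fffffff.
 */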
1093 
1094 #define DO_VSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL)
1095 #define DO_VSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL)
1096 #define DO_VRSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL)
1097 #define DO_VRSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL)
1098 
1099 DO_2OP_S(vshls, DO_VSHLS)
1100 DO_2OP_U(vshlu, DO_VSHLU)
1101 DO_2OP_S(vrshls, DO_VRSHLS)
1102 DO_2OP_U(vrshlu, DO_VRSHLU)
1103 
1104 #define DO_RHADD_S(N, M) (((int64_t)(N) + (M) + 1) >> 1)
1105 #define DO_RHADD_U(N, M) (((uint64_t)(N) + (M) + 1) >> 1)
1106 
1107 DO_2OP_S(vrhadds, DO_RHADD_S)
1108 DO_2OP_U(vrhaddu, DO_RHADD_U)
1109 
1110 static void do_vadc(CPUARMState *env, uint32_t *d, uint32_t *n, uint32_t *m,
1111                     uint32_t inv, uint32_t carry_in, bool update_flags)
1112 {
1113     uint16_t mask = mve_element_mask(env);
1114     unsigned e;
1115 
1116     /* If any lane actually performs its addition, we will update the flags. */
1117     if (mask & 0x1111) {
1118         update_flags = true;
1119     }
1120 
1121     for (e = 0; e < 16 / 4; e++, mask >>= 4) {
1122         uint64_t r = carry_in;
1123         r += n[H4(e)];
1124         r += m[H4(e)] ^ inv;
1125         if (mask & 1) {
1126             carry_in = r >> 32;
1127         }
1128         mergemask(&d[H4(e)], r, mask);
1129     }
1130 
1131     if (update_flags) {
1132         /* Store C, clear NZV. */
1133         env->vfp.fpsr &= ~FPSR_NZCV_MASK;
1134         env->vfp.fpsr |= carry_in * FPSR_C;
1135     }
1136     mve_advance_vpt(env);
1137 }
1138 
1139 void HELPER(mve_vadc)(CPUARMState *env, void *vd, void *vn, void *vm)
1140 {
1141     bool carry_in = env->vfp.fpsr & FPSR_C;
1142     do_vadc(env, vd, vn, vm, 0, carry_in, false);
1143 }
1144 
1145 void HELPER(mve_vsbc)(CPUARMState *env, void *vd, void *vn, void *vm)
1146 {
1147     bool carry_in = env->vfp.fpsr & FPSR_C;
1148     do_vadc(env, vd, vn, vm, -1, carry_in, false);
1149 }
1150 
1151 
1152 void HELPER(mve_vadci)(CPUARMState *env, void *vd, void *vn, void *vm)
1153 {
1154     do_vadc(env, vd, vn, vm, 0, 0, true);
1155 }
1156 
1157 void HELPER(mve_vsbci)(CPUARMState *env, void *vd, void *vn, void *vm)
1158 {
1159     do_vadc(env, vd, vn, vm, -1, 1, true);
1160 }
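/*
 * Editorial note (illustrative): subtraction reuses the do_vadc() core
 * via the two's-complement identity n - m == n + ~m + 1. With inv == -1
 * every m lane is XORed to ~m, and the initial carry_in supplies the
 * "+ 1" (FPSCR.C for VSBC, a fixed 1 for VSBCI), so the carry out acts
 * as "not borrow" in the usual Arm fashion. For example, 5 - 3 in one
 * 32-bit lane is 5 + 0xfffffffc + 1 == 0x100000002: low word 2, carry 1.
 */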
1161 
1162 #define DO_VCADD(OP, ESIZE, TYPE, FN0, FN1)                             \
1163     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
1164     {                                                                   \
1165         TYPE *d = vd, *n = vn, *m = vm;                                 \
1166         uint16_t mask = mve_element_mask(env);                          \
1167         unsigned e;                                                     \
1168         TYPE r[16 / ESIZE];                                             \
1169         /* Calculate all results first to avoid overwriting inputs */   \
1170         for (e = 0; e < 16 / ESIZE; e++) {                              \
1171             if (!(e & 1)) {                                             \
1172                 r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)]);         \
1173             } else {                                                    \
1174                 r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)]);         \
1175             }                                                           \
1176         }                                                               \
1177         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
1178             mergemask(&d[H##ESIZE(e)], r[e], mask);                     \
1179         }                                                               \
1180         mve_advance_vpt(env);                                           \
1181     }
1182 
1183 #define DO_VCADD_ALL(OP, FN0, FN1)              \
1184     DO_VCADD(OP##b, 1, int8_t, FN0, FN1)        \
1185     DO_VCADD(OP##h, 2, int16_t, FN0, FN1)       \
1186     DO_VCADD(OP##w, 4, int32_t, FN0, FN1)
1187 
1188 DO_VCADD_ALL(vcadd90, DO_SUB, DO_ADD)
1189 DO_VCADD_ALL(vcadd270, DO_ADD, DO_SUB)
1190 DO_VCADD_ALL(vhcadd90, do_vhsub_s, do_vhadd_s)
1191 DO_VCADD_ALL(vhcadd270, do_vhadd_s, do_vhsub_s)
1192 
1193 static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s)
1194 {
1195     if (val > max) {
1196         *s = true;
1197         return max;
1198     } else if (val < min) {
1199         *s = true;
1200         return min;
1201     }
1202     return val;
1203 }
1204 
1205 #define DO_SQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, INT8_MIN, INT8_MAX, s)
1206 #define DO_SQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, INT16_MIN, INT16_MAX, s)
1207 #define DO_SQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, INT32_MIN, INT32_MAX, s)
1208 
1209 #define DO_UQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT8_MAX, s)
1210 #define DO_UQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT16_MAX, s)
1211 #define DO_UQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT32_MAX, s)
1212 
1213 #define DO_SQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, INT8_MIN, INT8_MAX, s)
1214 #define DO_SQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, INT16_MIN, INT16_MAX, s)
1215 #define DO_SQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, INT32_MIN, INT32_MAX, s)
1216 
1217 #define DO_UQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT8_MAX, s)
1218 #define DO_UQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT16_MAX, s)
1219 #define DO_UQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT32_MAX, s)
1220 
1221 /*
1222  * For QDMULH and QRDMULH we simplify "double and shift by esize" into
1223  * "shift by esize-1", adjusting the QRDMULH rounding constant to match.
1224  */
1225 #define DO_QDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m) >> 7, \
1226                                         INT8_MIN, INT8_MAX, s)
1227 #define DO_QDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m) >> 15, \
1228                                         INT16_MIN, INT16_MAX, s)
1229 #define DO_QDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m) >> 31, \
1230                                         INT32_MIN, INT32_MAX, s)
1231 
1232 #define DO_QRDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 6)) >> 7, \
1233                                          INT8_MIN, INT8_MAX, s)
1234 #define DO_QRDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 14)) >> 15, \
1235                                          INT16_MIN, INT16_MAX, s)
1236 #define DO_QRDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 30)) >> 31, \
1237                                          INT32_MIN, INT32_MAX, s)
1238 
1239 DO_2OP_SAT(vqdmulhb, 1, int8_t, DO_QDMULH_B)
1240 DO_2OP_SAT(vqdmulhh, 2, int16_t, DO_QDMULH_H)
1241 DO_2OP_SAT(vqdmulhw, 4, int32_t, DO_QDMULH_W)
1242 
1243 DO_2OP_SAT(vqrdmulhb, 1, int8_t, DO_QRDMULH_B)
1244 DO_2OP_SAT(vqrdmulhh, 2, int16_t, DO_QRDMULH_H)
1245 DO_2OP_SAT(vqrdmulhw, 4, int32_t, DO_QRDMULH_W)
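/*
 * Editorial note (worked example): for QDMULH the only input pair that
 * can saturate is "most negative times most negative", e.g. for bytes
 *   DO_QDMULH_B(-128, -128) -> 16384 >> 7 == 128 > INT8_MAX
 * so the lane is saturated to 127 and the sat flag (and hence QC) is set.
 */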
1246 
1247 DO_2OP_SAT(vqaddub, 1, uint8_t, DO_UQADD_B)
1248 DO_2OP_SAT(vqadduh, 2, uint16_t, DO_UQADD_H)
1249 DO_2OP_SAT(vqadduw, 4, uint32_t, DO_UQADD_W)
1250 DO_2OP_SAT(vqaddsb, 1, int8_t, DO_SQADD_B)
1251 DO_2OP_SAT(vqaddsh, 2, int16_t, DO_SQADD_H)
1252 DO_2OP_SAT(vqaddsw, 4, int32_t, DO_SQADD_W)
1253 
1254 DO_2OP_SAT(vqsubub, 1, uint8_t, DO_UQSUB_B)
1255 DO_2OP_SAT(vqsubuh, 2, uint16_t, DO_UQSUB_H)
1256 DO_2OP_SAT(vqsubuw, 4, uint32_t, DO_UQSUB_W)
1257 DO_2OP_SAT(vqsubsb, 1, int8_t, DO_SQSUB_B)
1258 DO_2OP_SAT(vqsubsh, 2, int16_t, DO_SQSUB_H)
1259 DO_2OP_SAT(vqsubsw, 4, int32_t, DO_SQSUB_W)
1260 
1261 /*
1262  * This wrapper fixes up the impedance mismatch between do_sqrshl_bhs()
1263  * and friends wanting a uint32_t* sat and our needing a bool*.
1264  */
1265 #define WRAP_QRSHL_HELPER(FN, N, M, ROUND, satp)                        \
1266     ({                                                                  \
1267         uint32_t su32 = 0;                                              \
1268         typeof(N) qrshl_ret = FN(N, (int8_t)(M), sizeof(N) * 8, ROUND, &su32); \
1269         if (su32) {                                                     \
1270             *satp = true;                                               \
1271         }                                                               \
1272         qrshl_ret;                                                      \
1273     })
1274 
1275 #define DO_SQSHL_OP(N, M, satp) \
1276     WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, false, satp)
1277 #define DO_UQSHL_OP(N, M, satp) \
1278     WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, false, satp)
1279 #define DO_SQRSHL_OP(N, M, satp) \
1280     WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, true, satp)
1281 #define DO_UQRSHL_OP(N, M, satp) \
1282     WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, true, satp)
1283 #define DO_SUQSHL_OP(N, M, satp) \
1284     WRAP_QRSHL_HELPER(do_suqrshl_bhs, N, M, false, satp)
1285 
1286 DO_2OP_SAT_S(vqshls, DO_SQSHL_OP)
1287 DO_2OP_SAT_U(vqshlu, DO_UQSHL_OP)
1288 DO_2OP_SAT_S(vqrshls, DO_SQRSHL_OP)
1289 DO_2OP_SAT_U(vqrshlu, DO_UQRSHL_OP)
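/*
 * Editorial note (illustrative, assuming the usual saturating-shift
 * behaviour of the shared do_sqrshl_bhs()/do_uqrshl_bhs() helpers):
 * the callee reports saturation through a uint32_t, the wrapper above
 * folds any nonzero value into the per-lane bool, and the saturating
 * expanders only OR that into qc when the lane's predicate bit is set.
 * So e.g. an int8_t lane holding 0x40 shifted left by 2 would be
 * saturated to 0x7f and, if the lane is active, FPSCR.QC would be set.
 */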
1290 
1291 /*
1292  * Multiply add dual returning high half
1293  * The 'FN' here takes four data inputs A, B, C, D, plus a 0/1 indicator of
1294  * whether to add the rounding constant and a pointer to the saturation
1295  * flag; it should compute "(A * B + C * D) * 2 + rounding constant",
1296  * saturate to twice the input size and return the high half, or
1297  * "(A * B - C * D)" etc. for VQDMLSDH.
1298  */
1299 #define DO_VQDMLADH_OP(OP, ESIZE, TYPE, XCHG, ROUND, FN)                \
1300     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn,   \
1301                                 void *vm)                               \
1302     {                                                                   \
1303         TYPE *d = vd, *n = vn, *m = vm;                                 \
1304         uint16_t mask = mve_element_mask(env);                          \
1305         unsigned e;                                                     \
1306         bool qc = false;                                                \
1307         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
1308             bool sat = false;                                           \
1309             if ((e & 1) == XCHG) {                                      \
1310                 TYPE vqdmladh_ret = FN(n[H##ESIZE(e)],                  \
1311                             m[H##ESIZE(e - XCHG)],                      \
1312                             n[H##ESIZE(e + (1 - 2 * XCHG))],            \
1313                             m[H##ESIZE(e + (1 - XCHG))],                \
1314                             ROUND, &sat);                               \
1315                 mergemask(&d[H##ESIZE(e)], vqdmladh_ret, mask);         \
1316                 qc |= sat & mask & 1;                                   \
1317             }                                                           \
1318         }                                                               \
1319         if (qc) {                                                       \
1320             env->vfp.qc[0] = qc;                                        \
1321         }                                                               \
1322         mve_advance_vpt(env);                                           \
1323     }
1324 
1325 static int8_t do_vqdmladh_b(int8_t a, int8_t b, int8_t c, int8_t d,
1326                             int round, bool *sat)
1327 {
1328     int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 7);
1329     return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
1330 }
1331 
1332 static int16_t do_vqdmladh_h(int16_t a, int16_t b, int16_t c, int16_t d,
1333                              int round, bool *sat)
1334 {
1335     int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 15);
1336     return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
1337 }
1338 
1339 static int32_t do_vqdmladh_w(int32_t a, int32_t b, int32_t c, int32_t d,
1340                              int round, bool *sat)
1341 {
1342     int64_t m1 = (int64_t)a * b;
1343     int64_t m2 = (int64_t)c * d;
1344     int64_t r;
1345     /*
1346      * Architecturally we should do the entire add, double, round
1347      * and then check for saturation. We do three saturating adds,
1348      * but we need to be careful about the order. If the first
1349      * m1 + m2 saturates then it's impossible for the *2+rc to
1350      * bring it back into the non-saturated range. However, if
1351      * m1 + m2 is negative then it's possible that doing the doubling
1352      * would take the intermediate result below INT64_MIN and the
1353      * addition of the rounding constant then brings it back in range.
1354      * So we add half the rounding constant before doubling rather
1355      * than adding the rounding constant after the doubling.
1356      */
1357     if (sadd64_overflow(m1, m2, &r) ||
1358         sadd64_overflow(r, (round << 30), &r) ||
1359         sadd64_overflow(r, r, &r)) {
1360         *sat = true;
1361         return r < 0 ? INT32_MAX : INT32_MIN;
1362     }
1363     return r >> 32;
1364 }
1365 
1366 static int8_t do_vqdmlsdh_b(int8_t a, int8_t b, int8_t c, int8_t d,
1367                             int round, bool *sat)
1368 {
1369     int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 7);
1370     return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
1371 }
1372 
1373 static int16_t do_vqdmlsdh_h(int16_t a, int16_t b, int16_t c, int16_t d,
1374                              int round, bool *sat)
1375 {
1376     int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 15);
1377     return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
1378 }
1379 
1380 static int32_t do_vqdmlsdh_w(int32_t a, int32_t b, int32_t c, int32_t d,
1381                              int round, bool *sat)
1382 {
1383     int64_t m1 = (int64_t)a * b;
1384     int64_t m2 = (int64_t)c * d;
1385     int64_t r;
1386     /* The same ordering issue as in do_vqdmladh_w applies here too */
1387     if (ssub64_overflow(m1, m2, &r) ||
1388         sadd64_overflow(r, (round << 30), &r) ||
1389         sadd64_overflow(r, r, &r)) {
1390         *sat = true;
1391         return r < 0 ? INT32_MAX : INT32_MIN;
1392     }
1393     return r >> 32;
1394 }
1395 
1396 DO_VQDMLADH_OP(vqdmladhb, 1, int8_t, 0, 0, do_vqdmladh_b)
1397 DO_VQDMLADH_OP(vqdmladhh, 2, int16_t, 0, 0, do_vqdmladh_h)
1398 DO_VQDMLADH_OP(vqdmladhw, 4, int32_t, 0, 0, do_vqdmladh_w)
1399 DO_VQDMLADH_OP(vqdmladhxb, 1, int8_t, 1, 0, do_vqdmladh_b)
1400 DO_VQDMLADH_OP(vqdmladhxh, 2, int16_t, 1, 0, do_vqdmladh_h)
1401 DO_VQDMLADH_OP(vqdmladhxw, 4, int32_t, 1, 0, do_vqdmladh_w)
1402 
1403 DO_VQDMLADH_OP(vqrdmladhb, 1, int8_t, 0, 1, do_vqdmladh_b)
1404 DO_VQDMLADH_OP(vqrdmladhh, 2, int16_t, 0, 1, do_vqdmladh_h)
1405 DO_VQDMLADH_OP(vqrdmladhw, 4, int32_t, 0, 1, do_vqdmladh_w)
1406 DO_VQDMLADH_OP(vqrdmladhxb, 1, int8_t, 1, 1, do_vqdmladh_b)
1407 DO_VQDMLADH_OP(vqrdmladhxh, 2, int16_t, 1, 1, do_vqdmladh_h)
1408 DO_VQDMLADH_OP(vqrdmladhxw, 4, int32_t, 1, 1, do_vqdmladh_w)
1409 
1410 DO_VQDMLADH_OP(vqdmlsdhb, 1, int8_t, 0, 0, do_vqdmlsdh_b)
1411 DO_VQDMLADH_OP(vqdmlsdhh, 2, int16_t, 0, 0, do_vqdmlsdh_h)
1412 DO_VQDMLADH_OP(vqdmlsdhw, 4, int32_t, 0, 0, do_vqdmlsdh_w)
1413 DO_VQDMLADH_OP(vqdmlsdhxb, 1, int8_t, 1, 0, do_vqdmlsdh_b)
1414 DO_VQDMLADH_OP(vqdmlsdhxh, 2, int16_t, 1, 0, do_vqdmlsdh_h)
1415 DO_VQDMLADH_OP(vqdmlsdhxw, 4, int32_t, 1, 0, do_vqdmlsdh_w)
1416 
1417 DO_VQDMLADH_OP(vqrdmlsdhb, 1, int8_t, 0, 1, do_vqdmlsdh_b)
1418 DO_VQDMLADH_OP(vqrdmlsdhh, 2, int16_t, 0, 1, do_vqdmlsdh_h)
1419 DO_VQDMLADH_OP(vqrdmlsdhw, 4, int32_t, 0, 1, do_vqdmlsdh_w)
1420 DO_VQDMLADH_OP(vqrdmlsdhxb, 1, int8_t, 1, 1, do_vqdmlsdh_b)
1421 DO_VQDMLADH_OP(vqrdmlsdhxh, 2, int16_t, 1, 1, do_vqdmlsdh_h)
1422 DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w)
1423 
1424 #define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN)                              \
1425     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn,   \
1426                                 uint32_t rm)                            \
1427     {                                                                   \
1428         TYPE *d = vd, *n = vn;                                          \
1429         TYPE m = rm;                                                    \
1430         uint16_t mask = mve_element_mask(env);                          \
1431         unsigned e;                                                     \
1432         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
1433             mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m), mask);    \
1434         }                                                               \
1435         mve_advance_vpt(env);                                           \
1436     }
1437 
1438 #define DO_2OP_SAT_SCALAR(OP, ESIZE, TYPE, FN)                          \
1439     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn,   \
1440                                 uint32_t rm)                            \
1441     {                                                                   \
1442         TYPE *d = vd, *n = vn;                                          \
1443         TYPE m = rm;                                                    \
1444         uint16_t mask = mve_element_mask(env);                          \
1445         unsigned e;                                                     \
1446         bool qc = false;                                                \
1447         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
1448             bool sat = false;                                           \
1449             mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m, &sat),     \
1450                       mask);                                            \
1451             qc |= sat & mask & 1;                                       \
1452         }                                                               \
1453         if (qc) {                                                       \
1454             env->vfp.qc[0] = qc;                                        \
1455         }                                                               \
1456         mve_advance_vpt(env);                                           \
1457     }
1458 
1459 /* "accumulating" version where FN takes d as well as n and m */
1460 #define DO_2OP_ACC_SCALAR(OP, ESIZE, TYPE, FN)                          \
1461     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn,   \
1462                                 uint32_t rm)                            \
1463     {                                                                   \
1464         TYPE *d = vd, *n = vn;                                          \
1465         TYPE m = rm;                                                    \
1466         uint16_t mask = mve_element_mask(env);                          \
1467         unsigned e;                                                     \
1468         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
1469             mergemask(&d[H##ESIZE(e)],                                  \
1470                       FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m), mask);     \
1471         }                                                               \
1472         mve_advance_vpt(env);                                           \
1473     }
1474 
1475 #define DO_2OP_SAT_ACC_SCALAR(OP, ESIZE, TYPE, FN)                      \
1476     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn,   \
1477                                 uint32_t rm)                            \
1478     {                                                                   \
1479         TYPE *d = vd, *n = vn;                                          \
1480         TYPE m = rm;                                                    \
1481         uint16_t mask = mve_element_mask(env);                          \
1482         unsigned e;                                                     \
1483         bool qc = false;                                                \
1484         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
1485             bool sat = false;                                           \
1486             mergemask(&d[H##ESIZE(e)],                                  \
1487                       FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m, &sat),      \
1488                       mask);                                            \
1489             qc |= sat & mask & 1;                                       \
1490         }                                                               \
1491         if (qc) {                                                       \
1492             env->vfp.qc[0] = qc;                                        \
1493         }                                                               \
1494         mve_advance_vpt(env);                                           \
1495     }
1496 
1497 /* provide unsigned 2-op scalar helpers for all sizes */
1498 #define DO_2OP_SCALAR_U(OP, FN)                 \
1499     DO_2OP_SCALAR(OP##b, 1, uint8_t, FN)        \
1500     DO_2OP_SCALAR(OP##h, 2, uint16_t, FN)       \
1501     DO_2OP_SCALAR(OP##w, 4, uint32_t, FN)
1502 #define DO_2OP_SCALAR_S(OP, FN)                 \
1503     DO_2OP_SCALAR(OP##b, 1, int8_t, FN)         \
1504     DO_2OP_SCALAR(OP##h, 2, int16_t, FN)        \
1505     DO_2OP_SCALAR(OP##w, 4, int32_t, FN)
1506 
1507 #define DO_2OP_ACC_SCALAR_U(OP, FN)             \
1508     DO_2OP_ACC_SCALAR(OP##b, 1, uint8_t, FN)    \
1509     DO_2OP_ACC_SCALAR(OP##h, 2, uint16_t, FN)   \
1510     DO_2OP_ACC_SCALAR(OP##w, 4, uint32_t, FN)
1511 
1512 DO_2OP_SCALAR_U(vadd_scalar, DO_ADD)
1513 DO_2OP_SCALAR_U(vsub_scalar, DO_SUB)
1514 DO_2OP_SCALAR_U(vmul_scalar, DO_MUL)
1515 DO_2OP_SCALAR_S(vhadds_scalar, do_vhadd_s)
1516 DO_2OP_SCALAR_U(vhaddu_scalar, do_vhadd_u)
1517 DO_2OP_SCALAR_S(vhsubs_scalar, do_vhsub_s)
1518 DO_2OP_SCALAR_U(vhsubu_scalar, do_vhsub_u)
1519 
1520 DO_2OP_SAT_SCALAR(vqaddu_scalarb, 1, uint8_t, DO_UQADD_B)
1521 DO_2OP_SAT_SCALAR(vqaddu_scalarh, 2, uint16_t, DO_UQADD_H)
1522 DO_2OP_SAT_SCALAR(vqaddu_scalarw, 4, uint32_t, DO_UQADD_W)
1523 DO_2OP_SAT_SCALAR(vqadds_scalarb, 1, int8_t, DO_SQADD_B)
1524 DO_2OP_SAT_SCALAR(vqadds_scalarh, 2, int16_t, DO_SQADD_H)
1525 DO_2OP_SAT_SCALAR(vqadds_scalarw, 4, int32_t, DO_SQADD_W)
1526 
1527 DO_2OP_SAT_SCALAR(vqsubu_scalarb, 1, uint8_t, DO_UQSUB_B)
1528 DO_2OP_SAT_SCALAR(vqsubu_scalarh, 2, uint16_t, DO_UQSUB_H)
1529 DO_2OP_SAT_SCALAR(vqsubu_scalarw, 4, uint32_t, DO_UQSUB_W)
1530 DO_2OP_SAT_SCALAR(vqsubs_scalarb, 1, int8_t, DO_SQSUB_B)
1531 DO_2OP_SAT_SCALAR(vqsubs_scalarh, 2, int16_t, DO_SQSUB_H)
1532 DO_2OP_SAT_SCALAR(vqsubs_scalarw, 4, int32_t, DO_SQSUB_W)
1533 
1534 DO_2OP_SAT_SCALAR(vqdmulh_scalarb, 1, int8_t, DO_QDMULH_B)
1535 DO_2OP_SAT_SCALAR(vqdmulh_scalarh, 2, int16_t, DO_QDMULH_H)
1536 DO_2OP_SAT_SCALAR(vqdmulh_scalarw, 4, int32_t, DO_QDMULH_W)
1537 DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B)
1538 DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H)
1539 DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W)
1540 
1541 static int8_t do_vqdmlah_b(int8_t a, int8_t b, int8_t c, int round, bool *sat)
1542 {
1543     int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 8) + (round << 7);
1544     return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
1545 }
1546 
1547 static int16_t do_vqdmlah_h(int16_t a, int16_t b, int16_t c,
1548                            int round, bool *sat)
1549 {
1550     int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 16) + (round << 15);
1551     return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
1552 }
1553 
1554 static int32_t do_vqdmlah_w(int32_t a, int32_t b, int32_t c,
1555                             int round, bool *sat)
1556 {
1557     /*
1558      * Architecturally we should do the entire add, double, round
1559      * and then check for saturation. We do three saturating adds,
1560      * but we need to be careful about the order. If the first
1561      * m1 + m2 saturates then it's impossible for the *2+rc to
1562      * bring it back into the non-saturated range. However, if
1563      * m1 + m2 is negative then it's possible that doing the doubling
1564      * would take the intermediate result below INT64_MIN and the
1565      * addition of the rounding constant then brings it back in range.
1566      * So we add half the rounding constant and half the "c << esize"
1567      * before doubling rather than adding the rounding constant after
1568      * the doubling.
1569      */
1570     int64_t m1 = (int64_t)a * b;
1571     int64_t m2 = (int64_t)c << 31;
1572     int64_t r;
1573     if (sadd64_overflow(m1, m2, &r) ||
1574         sadd64_overflow(r, (round << 30), &r) ||
1575         sadd64_overflow(r, r, &r)) {
1576         *sat = true;
1577         return r < 0 ? INT32_MAX : INT32_MIN;
1578     }
1579     return r >> 32;
1580 }
1581 
1582 /*
1583  * The *MLAH insns are vector * scalar + vector;
1584  * the *MLASH insns are vector * vector + scalar
1585  */
1586 #define DO_VQDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 0, S)
1587 #define DO_VQDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 0, S)
1588 #define DO_VQDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 0, S)
1589 #define DO_VQRDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 1, S)
1590 #define DO_VQRDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 1, S)
1591 #define DO_VQRDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 1, S)
1592 
1593 #define DO_VQDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 0, S)
1594 #define DO_VQDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 0, S)
1595 #define DO_VQDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 0, S)
1596 #define DO_VQRDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 1, S)
1597 #define DO_VQRDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 1, S)
1598 #define DO_VQRDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 1, S)
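/*
 * Editorial note: DO_2OP_SAT_ACC_SCALAR invokes its callback as
 * FN(d[e], n[e], m, &sat), so for example
 *   DO_VQDMLAH_H(D, N, M, S)  -> do_vqdmlah_h(n[e], m,    d[e], 0, S)
 *   DO_VQDMLASH_H(D, N, M, S) -> do_vqdmlah_h(n[e], d[e], m,    0, S)
 * i.e. the operand that do_vqdmlah_*() shifts up by esize (its 'c')
 * is the destination vector for *MLAH but the scalar for *MLASH.
 */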
1599 
1600 DO_2OP_SAT_ACC_SCALAR(vqdmlahb, 1, int8_t, DO_VQDMLAH_B)
1601 DO_2OP_SAT_ACC_SCALAR(vqdmlahh, 2, int16_t, DO_VQDMLAH_H)
1602 DO_2OP_SAT_ACC_SCALAR(vqdmlahw, 4, int32_t, DO_VQDMLAH_W)
1603 DO_2OP_SAT_ACC_SCALAR(vqrdmlahb, 1, int8_t, DO_VQRDMLAH_B)
1604 DO_2OP_SAT_ACC_SCALAR(vqrdmlahh, 2, int16_t, DO_VQRDMLAH_H)
1605 DO_2OP_SAT_ACC_SCALAR(vqrdmlahw, 4, int32_t, DO_VQRDMLAH_W)
1606 
1607 DO_2OP_SAT_ACC_SCALAR(vqdmlashb, 1, int8_t, DO_VQDMLASH_B)
1608 DO_2OP_SAT_ACC_SCALAR(vqdmlashh, 2, int16_t, DO_VQDMLASH_H)
1609 DO_2OP_SAT_ACC_SCALAR(vqdmlashw, 4, int32_t, DO_VQDMLASH_W)
1610 DO_2OP_SAT_ACC_SCALAR(vqrdmlashb, 1, int8_t, DO_VQRDMLASH_B)
1611 DO_2OP_SAT_ACC_SCALAR(vqrdmlashh, 2, int16_t, DO_VQRDMLASH_H)
1612 DO_2OP_SAT_ACC_SCALAR(vqrdmlashw, 4, int32_t, DO_VQRDMLASH_W)
1613 
1614 /* Vector by scalar plus vector */
1615 #define DO_VMLA(D, N, M) ((N) * (M) + (D))
1616 
1617 DO_2OP_ACC_SCALAR_U(vmla, DO_VMLA)
1618 
1619 /* Vector by vector plus scalar */
1620 #define DO_VMLAS(D, N, M) ((N) * (D) + (M))
1621 
1622 DO_2OP_ACC_SCALAR_U(vmlas, DO_VMLAS)
1623 
1624 /*
1625  * Long saturating scalar ops. As with DO_2OP_L, TYPE and H are for the
1626  * input (smaller) type and LESIZE, LTYPE, LH for the output (long) type.
1627  * SATMASK specifies which bits of the predicate mask matter for determining
1628  * whether to propagate a saturation indication into FPSCR.QC -- for
1629  * the 16x16->32 case we must check only the bit corresponding to the T or B
1630  * half that we used, but for the 32x32->64 case we propagate if the mask
1631  * bit is set for either half.
1632  */
1633 #define DO_2OP_SAT_SCALAR_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \
1634     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn,   \
1635                                 uint32_t rm)                            \
1636     {                                                                   \
1637         LTYPE *d = vd;                                                  \
1638         TYPE *n = vn;                                                   \
1639         TYPE m = rm;                                                    \
1640         uint16_t mask = mve_element_mask(env);                          \
1641         unsigned le;                                                    \
1642         bool qc = false;                                                \
1643         for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) {         \
1644             bool sat = false;                                           \
1645             LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], m, &sat);    \
1646             mergemask(&d[H##LESIZE(le)], r, mask);                      \
1647             qc |= sat && (mask & SATMASK);                              \
1648         }                                                               \
1649         if (qc) {                                                       \
1650             env->vfp.qc[0] = qc;                                        \
1651         }                                                               \
1652         mve_advance_vpt(env);                                           \
1653     }
1654 
1655 static inline int32_t do_qdmullh(int16_t n, int16_t m, bool *sat)
1656 {
1657     int64_t r = ((int64_t)n * m) * 2;
1658     return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat);
1659 }
1660 
1661 static inline int64_t do_qdmullw(int32_t n, int32_t m, bool *sat)
1662 {
1663     /* The multiply can't overflow, but the doubling might */
1664     int64_t r = (int64_t)n * m;
1665     if (r > INT64_MAX / 2) {
1666         *sat = true;
1667         return INT64_MAX;
1668     } else if (r < INT64_MIN / 2) {
1669         *sat = true;
1670         return INT64_MIN;
1671     } else {
1672         return r * 2;
1673     }
1674 }
1675 
1676 #define SATMASK16B 1
1677 #define SATMASK16T (1 << 2)
1678 #define SATMASK32 ((1 << 4) | 1)
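/*
 * Editorial note: within one output lane's slice of the predicate mask,
 * the 16-bit predicate bits sit at bits 0 (bottom halfword) and 2 (top
 * halfword), and the 32-bit predicate bits at bits 0 and 4. So if only
 * the top halfword of an output lane is active (slice 0b0100), a
 * saturation in vqdmullt_scalarh propagates to QC via SATMASK16T while
 * one in vqdmullb_scalarh would be ignored.
 */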
1679 
1680 DO_2OP_SAT_SCALAR_L(vqdmullb_scalarh, 0, 2, int16_t, 4, int32_t, \
1681                     do_qdmullh, SATMASK16B)
1682 DO_2OP_SAT_SCALAR_L(vqdmullb_scalarw, 0, 4, int32_t, 8, int64_t, \
1683                     do_qdmullw, SATMASK32)
1684 DO_2OP_SAT_SCALAR_L(vqdmullt_scalarh, 1, 2, int16_t, 4, int32_t, \
1685                     do_qdmullh, SATMASK16T)
1686 DO_2OP_SAT_SCALAR_L(vqdmullt_scalarw, 1, 4, int32_t, 8, int64_t, \
1687                     do_qdmullw, SATMASK32)
1688 
1689 /*
1690  * Long saturating ops
1691  */
1692 #define DO_2OP_SAT_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK)  \
1693     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn,   \
1694                                 void *vm)                               \
1695     {                                                                   \
1696         LTYPE *d = vd;                                                  \
1697         TYPE *n = vn, *m = vm;                                          \
1698         uint16_t mask = mve_element_mask(env);                          \
1699         unsigned le;                                                    \
1700         bool qc = false;                                                \
1701         for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) {         \
1702             bool sat = false;                                           \
1703             LTYPE op1 = n[H##ESIZE(le * 2 + TOP)];                      \
1704             LTYPE op2 = m[H##ESIZE(le * 2 + TOP)];                      \
1705             mergemask(&d[H##LESIZE(le)], FN(op1, op2, &sat), mask);     \
1706             qc |= sat && (mask & SATMASK);                              \
1707         }                                                               \
1708         if (qc) {                                                       \
1709             env->vfp.qc[0] = qc;                                        \
1710         }                                                               \
1711         mve_advance_vpt(env);                                           \
1712     }
1713 
1714 DO_2OP_SAT_L(vqdmullbh, 0, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16B)
1715 DO_2OP_SAT_L(vqdmullbw, 0, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32)
1716 DO_2OP_SAT_L(vqdmullth, 1, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16T)
1717 DO_2OP_SAT_L(vqdmulltw, 1, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32)
1718 
1719 static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m)
1720 {
1721     m &= 0xff;
1722     if (m == 0) {
1723         return 0;
1724     }
1725     n = revbit8(n);
1726     if (m < 8) {
1727         n >>= 8 - m;
1728     }
1729     return n;
1730 }
1731 
1732 static inline uint32_t do_vbrsrh(uint32_t n, uint32_t m)
1733 {
1734     m &= 0xff;
1735     if (m == 0) {
1736         return 0;
1737     }
1738     n = revbit16(n);
1739     if (m < 16) {
1740         n >>= 16 - m;
1741     }
1742     return n;
1743 }
1744 
1745 static inline uint32_t do_vbrsrw(uint32_t n, uint32_t m)
1746 {
1747     m &= 0xff;
1748     if (m == 0) {
1749         return 0;
1750     }
1751     n = revbit32(n);
1752     if (m < 32) {
1753         n >>= 32 - m;
1754     }
1755     return n;
1756 }
1757 
1758 DO_2OP_SCALAR(vbrsrb, 1, uint8_t, do_vbrsrb)
1759 DO_2OP_SCALAR(vbrsrh, 2, uint16_t, do_vbrsrh)
1760 DO_2OP_SCALAR(vbrsrw, 4, uint32_t, do_vbrsrw)
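/*
 * Editorial note (worked example): VBRSR reverses the bottom m bits of
 * each element and clears the rest, e.g.
 *   do_vbrsrb(0x01, 3): revbit8(0x01) == 0x80, then 0x80 >> (8 - 3) == 0x04
 * i.e. 0b001 becomes 0b100 with all higher bits zeroed.
 */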
1761 
1762 /*
1763  * Multiply add long dual accumulate ops.
1764  */
1765 #define DO_LDAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC)                 \
1766     uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn,         \
1767                                     void *vm, uint64_t a)               \
1768     {                                                                   \
1769         uint16_t mask = mve_element_mask(env);                          \
1770         unsigned e;                                                     \
1771         TYPE *n = vn, *m = vm;                                          \
1772         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
1773             if (mask & 1) {                                             \
1774                 if (e & 1) {                                            \
1775                     a ODDACC                                            \
1776                         (int64_t)n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \
1777                 } else {                                                \
1778                     a EVENACC                                           \
1779                         (int64_t)n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \
1780                 }                                                       \
1781             }                                                           \
1782         }                                                               \
1783         mve_advance_vpt(env);                                           \
1784         return a;                                                       \
1785     }
1786 
1787 DO_LDAV(vmlaldavsh, 2, int16_t, false, +=, +=)
1788 DO_LDAV(vmlaldavxsh, 2, int16_t, true, +=, +=)
1789 DO_LDAV(vmlaldavsw, 4, int32_t, false, +=, +=)
1790 DO_LDAV(vmlaldavxsw, 4, int32_t, true, +=, +=)
1791 
1792 DO_LDAV(vmlaldavuh, 2, uint16_t, false, +=, +=)
1793 DO_LDAV(vmlaldavuw, 4, uint32_t, false, +=, +=)
1794 
1795 DO_LDAV(vmlsldavsh, 2, int16_t, false, +=, -=)
1796 DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=)
1797 DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=)
1798 DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=)
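/*
 * Editorial note: EVENACC/ODDACC are pasted verbatim as the assignment
 * operator, so e.g. the vmlsldavsh loop body is effectively
 *   if (e & 1) {
 *       a -= (int64_t)n[H2(e)] * m[H2(e)];
 *   } else {
 *       a += (int64_t)n[H2(e)] * m[H2(e)];
 *   }
 * with the X (exchange) variants offsetting the n index by +/-1.
 */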
1799 
1800 /*
1801  * Multiply add dual accumulate ops
1802  */
1803 #define DO_DAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \
1804     uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn,         \
1805                                     void *vm, uint32_t a)               \
1806     {                                                                   \
1807         uint16_t mask = mve_element_mask(env);                          \
1808         unsigned e;                                                     \
1809         TYPE *n = vn, *m = vm;                                          \
1810         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
1811             if (mask & 1) {                                             \
1812                 if (e & 1) {                                            \
1813                     a ODDACC                                            \
1814                         n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)];     \
1815                 } else {                                                \
1816                     a EVENACC                                           \
1817                         n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)];     \
1818                 }                                                       \
1819             }                                                           \
1820         }                                                               \
1821         mve_advance_vpt(env);                                           \
1822         return a;                                                       \
1823     }
1824 
1825 #define DO_DAV_S(INSN, XCHG, EVENACC, ODDACC)           \
1826     DO_DAV(INSN##b, 1, int8_t, XCHG, EVENACC, ODDACC)   \
1827     DO_DAV(INSN##h, 2, int16_t, XCHG, EVENACC, ODDACC)  \
1828     DO_DAV(INSN##w, 4, int32_t, XCHG, EVENACC, ODDACC)
1829 
1830 #define DO_DAV_U(INSN, XCHG, EVENACC, ODDACC)           \
1831     DO_DAV(INSN##b, 1, uint8_t, XCHG, EVENACC, ODDACC)  \
1832     DO_DAV(INSN##h, 2, uint16_t, XCHG, EVENACC, ODDACC) \
1833     DO_DAV(INSN##w, 4, uint32_t, XCHG, EVENACC, ODDACC)
1834 
1835 DO_DAV_S(vmladavs, false, +=, +=)
1836 DO_DAV_U(vmladavu, false, +=, +=)
1837 DO_DAV_S(vmlsdav, false, +=, -=)
1838 DO_DAV_S(vmladavsx, true, +=, +=)
1839 DO_DAV_S(vmlsdavx, true, +=, -=)
1840 
1841 /*
1842  * Rounding multiply add long dual accumulate high. In the pseudocode
1843  * this is implemented with a 72-bit internal accumulator value of which
1844  * the top 64 bits are returned. We optimize this to avoid having to
1845  * use 128-bit arithmetic -- we can do this because the 72-bit accumulator
1846  * is squashed back into 64-bits after each beat.
1847  */
1848 #define DO_LDAVH(OP, TYPE, LTYPE, XCHG, SUB)                            \
1849     uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn,         \
1850                                     void *vm, uint64_t a)               \
1851     {                                                                   \
1852         uint16_t mask = mve_element_mask(env);                          \
1853         unsigned e;                                                     \
1854         TYPE *n = vn, *m = vm;                                          \
1855         for (e = 0; e < 16 / 4; e++, mask >>= 4) {                      \
1856             if (mask & 1) {                                             \
1857                 LTYPE mul;                                              \
1858                 if (e & 1) {                                            \
1859                     mul = (LTYPE)n[H4(e - 1 * XCHG)] * m[H4(e)];        \
1860                     if (SUB) {                                          \
1861                         mul = -mul;                                     \
1862                     }                                                   \
1863                 } else {                                                \
1864                     mul = (LTYPE)n[H4(e + 1 * XCHG)] * m[H4(e)];        \
1865                 }                                                       \
1866                 mul = (mul >> 8) + ((mul >> 7) & 1);                    \
1867                 a += mul;                                               \
1868             }                                                           \
1869         }                                                               \
1870         mve_advance_vpt(env);                                           \
1871         return a;                                                       \
1872     }
1873 
1874 DO_LDAVH(vrmlaldavhsw, int32_t, int64_t, false, false)
1875 DO_LDAVH(vrmlaldavhxsw, int32_t, int64_t, true, false)
1876 
1877 DO_LDAVH(vrmlaldavhuw, uint32_t, uint64_t, false, false)
1878 
1879 DO_LDAVH(vrmlsldavhsw, int32_t, int64_t, false, true)
1880 DO_LDAVH(vrmlsldavhxsw, int32_t, int64_t, true, true)
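/*
 * Editorial note (worked example): the per-beat squash above is a
 * rounding arithmetic shift right by 8, i.e. it discards the 8 fraction
 * bits the 72-bit architectural accumulator would keep:
 *   mul == 384 (1.5 in that fixed point) -> (384 >> 8) + ((384 >> 7) & 1) == 2
 * so each beat's contribution is rounded to nearest before accumulation.
 */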
1881 
1882 /* Vector add across vector */
1883 #define DO_VADDV(OP, ESIZE, TYPE)                               \
1884     uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
1885                                     uint32_t ra)                \
1886     {                                                           \
1887         uint16_t mask = mve_element_mask(env);                  \
1888         unsigned e;                                             \
1889         TYPE *m = vm;                                           \
1890         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {      \
1891             if (mask & 1) {                                     \
1892                 ra += m[H##ESIZE(e)];                           \
1893             }                                                   \
1894         }                                                       \
1895         mve_advance_vpt(env);                                   \
1896         return ra;                                              \
1897     }
1898 
1899 DO_VADDV(vaddvsb, 1, int8_t)
1900 DO_VADDV(vaddvsh, 2, int16_t)
1901 DO_VADDV(vaddvsw, 4, int32_t)
1902 DO_VADDV(vaddvub, 1, uint8_t)
1903 DO_VADDV(vaddvuh, 2, uint16_t)
1904 DO_VADDV(vaddvuw, 4, uint32_t)
1905 
1906 /*
1907  * Vector max/min across vector. Unlike VADDV, we must truncate
1908  * the incoming ra to the element size rather than use its full width.
1909  * We work with int64_t internally for simplicity.
1910  */
1911 #define DO_VMAXMINV(OP, ESIZE, TYPE, RATYPE, FN)                \
1912     uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
1913                                     uint32_t ra_in)             \
1914     {                                                           \
1915         uint16_t mask = mve_element_mask(env);                  \
1916         unsigned e;                                             \
1917         TYPE *m = vm;                                           \
1918         int64_t ra = (RATYPE)ra_in;                             \
1919         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {      \
1920             if (mask & 1) {                                     \
1921                 ra = FN(ra, m[H##ESIZE(e)]);                    \
1922             }                                                   \
1923         }                                                       \
1924         mve_advance_vpt(env);                                   \
1925         return ra;                                              \
1926     }
1927 
1928 #define DO_VMAXMINV_U(INSN, FN)                         \
1929     DO_VMAXMINV(INSN##b, 1, uint8_t, uint8_t, FN)       \
1930     DO_VMAXMINV(INSN##h, 2, uint16_t, uint16_t, FN)     \
1931     DO_VMAXMINV(INSN##w, 4, uint32_t, uint32_t, FN)
1932 #define DO_VMAXMINV_S(INSN, FN)                         \
1933     DO_VMAXMINV(INSN##b, 1, int8_t, int8_t, FN)         \
1934     DO_VMAXMINV(INSN##h, 2, int16_t, int16_t, FN)       \
1935     DO_VMAXMINV(INSN##w, 4, int32_t, int32_t, FN)
1936 
1937 /*
1938  * Helpers for max and min of absolute values across vector:
1939  * note that we only take the absolute value of 'm', not 'n'
1940  */
1941 static int64_t do_maxa(int64_t n, int64_t m)
1942 {
1943     if (m < 0) {
1944         m = -m;
1945     }
1946     return MAX(n, m);
1947 }
1948 
1949 static int64_t do_mina(int64_t n, int64_t m)
1950 {
1951     if (m < 0) {
1952         m = -m;
1953     }
1954     return MIN(n, m);
1955 }
1956 
1957 DO_VMAXMINV_S(vmaxvs, DO_MAX)
1958 DO_VMAXMINV_U(vmaxvu, DO_MAX)
1959 DO_VMAXMINV_S(vminvs, DO_MIN)
1960 DO_VMAXMINV_U(vminvu, DO_MIN)
1961 /*
1962  * VMAXAV, VMINAV treat the general purpose input as unsigned
1963  * and the vector elements as signed.
1964  */
1965 DO_VMAXMINV(vmaxavb, 1, int8_t, uint8_t, do_maxa)
1966 DO_VMAXMINV(vmaxavh, 2, int16_t, uint16_t, do_maxa)
1967 DO_VMAXMINV(vmaxavw, 4, int32_t, uint32_t, do_maxa)
1968 DO_VMAXMINV(vminavb, 1, int8_t, uint8_t, do_mina)
1969 DO_VMAXMINV(vminavh, 2, int16_t, uint16_t, do_mina)
1970 DO_VMAXMINV(vminavw, 4, int32_t, uint32_t, do_mina)
1971 
1972 #define DO_VABAV(OP, ESIZE, TYPE)                               \
1973     uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
1974                                     void *vm, uint32_t ra)      \
1975     {                                                           \
1976         uint16_t mask = mve_element_mask(env);                  \
1977         unsigned e;                                             \
1978         TYPE *m = vm, *n = vn;                                  \
1979         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {      \
1980             if (mask & 1) {                                     \
1981                 int64_t n0 = n[H##ESIZE(e)];                    \
1982                 int64_t m0 = m[H##ESIZE(e)];                    \
1983                 uint32_t r = n0 >= m0 ? (n0 - m0) : (m0 - n0);  \
1984                 ra += r;                                        \
1985             }                                                   \
1986         }                                                       \
1987         mve_advance_vpt(env);                                   \
1988         return ra;                                              \
1989     }
1990 
1991 DO_VABAV(vabavsb, 1, int8_t)
1992 DO_VABAV(vabavsh, 2, int16_t)
1993 DO_VABAV(vabavsw, 4, int32_t)
1994 DO_VABAV(vabavub, 1, uint8_t)
1995 DO_VABAV(vabavuh, 2, uint16_t)
1996 DO_VABAV(vabavuw, 4, uint32_t)
1997 
1998 #define DO_VADDLV(OP, TYPE, LTYPE)                              \
1999     uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
2000                                     uint64_t ra)                \
2001     {                                                           \
2002         uint16_t mask = mve_element_mask(env);                  \
2003         unsigned e;                                             \
2004         TYPE *m = vm;                                           \
2005         for (e = 0; e < 16 / 4; e++, mask >>= 4) {              \
2006             if (mask & 1) {                                     \
2007                 ra += (LTYPE)m[H4(e)];                          \
2008             }                                                   \
2009         }                                                       \
2010         mve_advance_vpt(env);                                   \
2011         return ra;                                              \
2012     }
2013 
2014 DO_VADDLV(vaddlv_s, int32_t, int64_t)
2015 DO_VADDLV(vaddlv_u, uint32_t, uint64_t)
2016 
2017 /* Shifts by immediate */
2018 #define DO_2SHIFT(OP, ESIZE, TYPE, FN)                          \
2019     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd,     \
2020                                 void *vm, uint32_t shift)       \
2021     {                                                           \
2022         TYPE *d = vd, *m = vm;                                  \
2023         uint16_t mask = mve_element_mask(env);                  \
2024         unsigned e;                                             \
2025         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {      \
2026             mergemask(&d[H##ESIZE(e)],                          \
2027                       FN(m[H##ESIZE(e)], shift), mask);         \
2028         }                                                       \
2029         mve_advance_vpt(env);                                   \
2030     }
2031 
2032 #define DO_2SHIFT_SAT(OP, ESIZE, TYPE, FN)                      \
2033     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd,     \
2034                                 void *vm, uint32_t shift)       \
2035     {                                                           \
2036         TYPE *d = vd, *m = vm;                                  \
2037         uint16_t mask = mve_element_mask(env);                  \
2038         unsigned e;                                             \
2039         bool qc = false;                                        \
2040         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {      \
2041             bool sat = false;                                   \
2042             mergemask(&d[H##ESIZE(e)],                          \
2043                       FN(m[H##ESIZE(e)], shift, &sat), mask);   \
2044             qc |= sat & mask & 1;                               \
2045         }                                                       \
2046         if (qc) {                                               \
2047             env->vfp.qc[0] = qc;                                \
2048         }                                                       \
2049         mve_advance_vpt(env);                                   \
2050     }
2051 
2052 /* provide unsigned 2-op shift helpers for all sizes */
2053 #define DO_2SHIFT_U(OP, FN)                     \
2054     DO_2SHIFT(OP##b, 1, uint8_t, FN)            \
2055     DO_2SHIFT(OP##h, 2, uint16_t, FN)           \
2056     DO_2SHIFT(OP##w, 4, uint32_t, FN)
2057 #define DO_2SHIFT_S(OP, FN)                     \
2058     DO_2SHIFT(OP##b, 1, int8_t, FN)             \
2059     DO_2SHIFT(OP##h, 2, int16_t, FN)            \
2060     DO_2SHIFT(OP##w, 4, int32_t, FN)
2061 
2062 #define DO_2SHIFT_SAT_U(OP, FN)                 \
2063     DO_2SHIFT_SAT(OP##b, 1, uint8_t, FN)        \
2064     DO_2SHIFT_SAT(OP##h, 2, uint16_t, FN)       \
2065     DO_2SHIFT_SAT(OP##w, 4, uint32_t, FN)
2066 #define DO_2SHIFT_SAT_S(OP, FN)                 \
2067     DO_2SHIFT_SAT(OP##b, 1, int8_t, FN)         \
2068     DO_2SHIFT_SAT(OP##h, 2, int16_t, FN)        \
2069     DO_2SHIFT_SAT(OP##w, 4, int32_t, FN)
2070 
2071 DO_2SHIFT_U(vshli_u, DO_VSHLU)
2072 DO_2SHIFT_S(vshli_s, DO_VSHLS)
2073 DO_2SHIFT_SAT_U(vqshli_u, DO_UQSHL_OP)
2074 DO_2SHIFT_SAT_S(vqshli_s, DO_SQSHL_OP)
2075 DO_2SHIFT_SAT_S(vqshlui_s, DO_SUQSHL_OP)
2076 DO_2SHIFT_U(vrshli_u, DO_VRSHLU)
2077 DO_2SHIFT_S(vrshli_s, DO_VRSHLS)
2078 DO_2SHIFT_SAT_U(vqrshli_u, DO_UQRSHL_OP)
2079 DO_2SHIFT_SAT_S(vqrshli_s, DO_SQRSHL_OP)
2080 
2081 /* Shift-and-insert; we always work with 64 bits at a time */
2082 #define DO_2SHIFT_INSERT(OP, ESIZE, SHIFTFN, MASKFN)                    \
2083     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd,             \
2084                                 void *vm, uint32_t shift)               \
2085     {                                                                   \
2086         uint64_t *d = vd, *m = vm;                                      \
2087         uint16_t mask;                                                  \
2088         uint64_t shiftmask;                                             \
2089         unsigned e;                                                     \
2090         if (shift == ESIZE * 8) {                                       \
2091             /*                                                          \
2092              * Only VSRI can shift by <dt>; it should leave the          \
2093              * destination unchanged. The generic logic can't handle     \
2094              * this because it would try to shift by an out-of-range    \
2095              * amount, so special case it here.                         \
2096              */                                                         \
2097             goto done;                                                  \
2098         }                                                               \
2099         assert(shift < ESIZE * 8);                                      \
2100         mask = mve_element_mask(env);                                   \
2101         /* ESIZE / 2 gives the MO_* value if ESIZE is in [1,2,4] */     \
2102         shiftmask = dup_const(ESIZE / 2, MASKFN(ESIZE * 8, shift));     \
2103         for (e = 0; e < 16 / 8; e++, mask >>= 8) {                      \
2104             uint64_t r = (SHIFTFN(m[H8(e)], shift) & shiftmask) |       \
2105                 (d[H8(e)] & ~shiftmask);                                \
2106             mergemask(&d[H8(e)], r, mask);                              \
2107         }                                                               \
2108 done:                                                                   \
2109         mve_advance_vpt(env);                                           \
2110     }
2111 
2112 #define DO_SHL(N, SHIFT) ((N) << (SHIFT))
2113 #define DO_SHR(N, SHIFT) ((N) >> (SHIFT))
2114 #define SHL_MASK(EBITS, SHIFT) MAKE_64BIT_MASK((SHIFT), (EBITS) - (SHIFT))
2115 #define SHR_MASK(EBITS, SHIFT) MAKE_64BIT_MASK(0, (EBITS) - (SHIFT))
2116 
2117 DO_2SHIFT_INSERT(vsrib, 1, DO_SHR, SHR_MASK)
2118 DO_2SHIFT_INSERT(vsrih, 2, DO_SHR, SHR_MASK)
2119 DO_2SHIFT_INSERT(vsriw, 4, DO_SHR, SHR_MASK)
2120 DO_2SHIFT_INSERT(vslib, 1, DO_SHL, SHL_MASK)
2121 DO_2SHIFT_INSERT(vslih, 2, DO_SHL, SHL_MASK)
2122 DO_2SHIFT_INSERT(vsliw, 4, DO_SHL, SHL_MASK)
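/*
 * Editorial note (worked example): for vslib with shift == 3,
 * SHL_MASK(8, 3) == MAKE_64BIT_MASK(3, 5) == 0xf8, which dup_const()
 * replicates to 0xf8f8f8f8f8f8f8f8; each byte then takes bits [7:3]
 * from (m << 3) and keeps bits [2:0] of the destination. For vsrib
 * with shift == 3, SHR_MASK(8, 3) == 0x1f, so bits [4:0] come from
 * (m >> 3) and bits [7:5] are preserved.
 */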
2123 
2124 /*
2125  * Long shifts taking half-sized inputs from top or bottom of the input
2126  * vector and producing a double-width result. ESIZE, TYPE are for
2127  * the input, and LESIZE, LTYPE for the output.
2128  * Unlike the normal shift helpers, we do not handle negative shift counts,
2129  * because the long shift is strictly left-only.
2130  */
2131 #define DO_VSHLL(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE)                   \
2132     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd,             \
2133                                 void *vm, uint32_t shift)               \
2134     {                                                                   \
2135         LTYPE *d = vd;                                                  \
2136         TYPE *m = vm;                                                   \
2137         uint16_t mask = mve_element_mask(env);                          \
2138         unsigned le;                                                    \
2139         assert(shift <= 16);                                            \
2140         for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) {         \
2141             LTYPE r = (LTYPE)m[H##ESIZE(le * 2 + TOP)] << shift;        \
2142             mergemask(&d[H##LESIZE(le)], r, mask);                      \
2143         }                                                               \
2144         mve_advance_vpt(env);                                           \
2145     }
2146 
2147 #define DO_VSHLL_ALL(OP, TOP)                                \
2148     DO_VSHLL(OP##sb, TOP, 1, int8_t, 2, int16_t)             \
2149     DO_VSHLL(OP##ub, TOP, 1, uint8_t, 2, uint16_t)           \
2150     DO_VSHLL(OP##sh, TOP, 2, int16_t, 4, int32_t)            \
2151     DO_VSHLL(OP##uh, TOP, 2, uint16_t, 4, uint32_t)          \
2152 
2153 DO_VSHLL_ALL(vshllb, false)
2154 DO_VSHLL_ALL(vshllt, true)
2155 
2156 /*
2157  * Narrowing right shifts, taking a double sized input, shifting it
2158  * and putting the result in either the top or bottom half of the output.
2159  * ESIZE, TYPE are the output, and LESIZE, LTYPE the input.
2160  */
2161 #define DO_VSHRN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN)       \
2162     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd,     \
2163                                 void *vm, uint32_t shift)       \
2164     {                                                           \
2165         LTYPE *m = vm;                                          \
2166         TYPE *d = vd;                                           \
2167         uint16_t mask = mve_element_mask(env);                  \
2168         unsigned le;                                            \
2169         mask >>= ESIZE * TOP;                                   \
2170         for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
2171             TYPE r = FN(m[H##LESIZE(le)], shift);               \
2172             mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask);     \
2173         }                                                       \
2174         mve_advance_vpt(env);                                   \
2175     }
2176 
2177 #define DO_VSHRN_ALL(OP, FN)                                    \
2178     DO_VSHRN(OP##bb, false, 1, uint8_t, 2, uint16_t, FN)        \
2179     DO_VSHRN(OP##bh, false, 2, uint16_t, 4, uint32_t, FN)       \
2180     DO_VSHRN(OP##tb, true, 1, uint8_t, 2, uint16_t, FN)         \
2181     DO_VSHRN(OP##th, true, 2, uint16_t, 4, uint32_t, FN)
2182 
2183 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2184 {
2185     if (likely(sh < 64)) {
2186         return (x >> sh) + ((x >> (sh - 1)) & 1);
2187     } else if (sh == 64) {
2188         return x >> 63;
2189     } else {
2190         return 0;
2191     }
2192 }
2193 
2194 static inline int64_t do_srshr(int64_t x, unsigned sh)
2195 {
2196     if (likely(sh < 64)) {
2197         return (x >> sh) + ((x >> (sh - 1)) & 1);
2198     } else {
2199         /* Rounding the sign bit always produces 0. */
2200         return 0;
2201     }
2202 }
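
/*
 * Both rounding helpers compute (x >> sh) + ((x >> (sh - 1)) & 1), i.e.
 * a right shift rounded to nearest with ties rounded up, while avoiding
 * the overflow that the more obvious (x + (1 << (sh - 1))) >> sh could
 * hit for large x. Worked example: do_urshr(23, 3) is
 * (23 >> 3) + ((23 >> 2) & 1) = 2 + 1 = 3, matching 23/8 = 2.875
 * rounded to nearest.
 */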
2203 
2204 DO_VSHRN_ALL(vshrn, DO_SHR)
2205 DO_VSHRN_ALL(vrshrn, do_urshr)
2206 
2207 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max,
2208                                  bool *satp)
2209 {
2210     if (val > max) {
2211         *satp = true;
2212         return max;
2213     } else if (val < min) {
2214         *satp = true;
2215         return min;
2216     } else {
2217         return val;
2218     }
2219 }
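
/*
 * do_sat_bhs() clamps a 64-bit intermediate into the requested range and
 * records saturation. For example, with the INT8 bounds used below,
 * do_sat_bhs(300, INT8_MIN, INT8_MAX, &sat) returns 127 and sets sat,
 * while do_sat_bhs(-5, INT8_MIN, INT8_MAX, &sat) returns -5 and leaves
 * sat untouched.
 */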
2220 
2221 /* Saturating narrowing right shifts */
2222 #define DO_VSHRN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN)   \
2223     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd,     \
2224                                 void *vm, uint32_t shift)       \
2225     {                                                           \
2226         LTYPE *m = vm;                                          \
2227         TYPE *d = vd;                                           \
2228         uint16_t mask = mve_element_mask(env);                  \
2229         bool qc = false;                                        \
2230         unsigned le;                                            \
2231         mask >>= ESIZE * TOP;                                   \
2232         for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
2233             bool sat = false;                                   \
2234             TYPE r = FN(m[H##LESIZE(le)], shift, &sat);         \
2235             mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask);     \
2236             qc |= sat & mask & 1;                               \
2237         }                                                       \
2238         if (qc) {                                               \
2239             env->vfp.qc[0] = qc;                                \
2240         }                                                       \
2241         mve_advance_vpt(env);                                   \
2242     }
2243 
2244 #define DO_VSHRN_SAT_UB(BOP, TOP, FN)                           \
2245     DO_VSHRN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN)       \
2246     DO_VSHRN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN)
2247 
2248 #define DO_VSHRN_SAT_UH(BOP, TOP, FN)                           \
2249     DO_VSHRN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN)      \
2250     DO_VSHRN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN)
2251 
2252 #define DO_VSHRN_SAT_SB(BOP, TOP, FN)                           \
2253     DO_VSHRN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN)         \
2254     DO_VSHRN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN)
2255 
2256 #define DO_VSHRN_SAT_SH(BOP, TOP, FN)                           \
2257     DO_VSHRN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN)        \
2258     DO_VSHRN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN)
2259 
2260 #define DO_SHRN_SB(N, M, SATP)                                  \
2261     do_sat_bhs((int64_t)(N) >> (M), INT8_MIN, INT8_MAX, SATP)
2262 #define DO_SHRN_UB(N, M, SATP)                                  \
2263     do_sat_bhs((uint64_t)(N) >> (M), 0, UINT8_MAX, SATP)
2264 #define DO_SHRUN_B(N, M, SATP)                                  \
2265     do_sat_bhs((int64_t)(N) >> (M), 0, UINT8_MAX, SATP)
2266 
2267 #define DO_SHRN_SH(N, M, SATP)                                  \
2268     do_sat_bhs((int64_t)(N) >> (M), INT16_MIN, INT16_MAX, SATP)
2269 #define DO_SHRN_UH(N, M, SATP)                                  \
2270     do_sat_bhs((uint64_t)(N) >> (M), 0, UINT16_MAX, SATP)
2271 #define DO_SHRUN_H(N, M, SATP)                                  \
2272     do_sat_bhs((int64_t)(N) >> (M), 0, UINT16_MAX, SATP)
2273 
2274 #define DO_RSHRN_SB(N, M, SATP)                                 \
2275     do_sat_bhs(do_srshr(N, M), INT8_MIN, INT8_MAX, SATP)
2276 #define DO_RSHRN_UB(N, M, SATP)                                 \
2277     do_sat_bhs(do_urshr(N, M), 0, UINT8_MAX, SATP)
2278 #define DO_RSHRUN_B(N, M, SATP)                                 \
2279     do_sat_bhs(do_srshr(N, M), 0, UINT8_MAX, SATP)
2280 
2281 #define DO_RSHRN_SH(N, M, SATP)                                 \
2282     do_sat_bhs(do_srshr(N, M), INT16_MIN, INT16_MAX, SATP)
2283 #define DO_RSHRN_UH(N, M, SATP)                                 \
2284     do_sat_bhs(do_urshr(N, M), 0, UINT16_MAX, SATP)
2285 #define DO_RSHRUN_H(N, M, SATP)                                 \
2286     do_sat_bhs(do_srshr(N, M), 0, UINT16_MAX, SATP)
2287 
2288 DO_VSHRN_SAT_SB(vqshrnb_sb, vqshrnt_sb, DO_SHRN_SB)
2289 DO_VSHRN_SAT_SH(vqshrnb_sh, vqshrnt_sh, DO_SHRN_SH)
2290 DO_VSHRN_SAT_UB(vqshrnb_ub, vqshrnt_ub, DO_SHRN_UB)
2291 DO_VSHRN_SAT_UH(vqshrnb_uh, vqshrnt_uh, DO_SHRN_UH)
2292 DO_VSHRN_SAT_SB(vqshrunbb, vqshruntb, DO_SHRUN_B)
2293 DO_VSHRN_SAT_SH(vqshrunbh, vqshrunth, DO_SHRUN_H)
2294 
2295 DO_VSHRN_SAT_SB(vqrshrnb_sb, vqrshrnt_sb, DO_RSHRN_SB)
2296 DO_VSHRN_SAT_SH(vqrshrnb_sh, vqrshrnt_sh, DO_RSHRN_SH)
2297 DO_VSHRN_SAT_UB(vqrshrnb_ub, vqrshrnt_ub, DO_RSHRN_UB)
2298 DO_VSHRN_SAT_UH(vqrshrnb_uh, vqrshrnt_uh, DO_RSHRN_UH)
2299 DO_VSHRN_SAT_SB(vqrshrunbb, vqrshruntb, DO_RSHRUN_B)
2300 DO_VSHRN_SAT_SH(vqrshrunbh, vqrshrunth, DO_RSHRUN_H)
2301 
2302 #define DO_VMOVN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE)                   \
2303     void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm)         \
2304     {                                                                   \
2305         LTYPE *m = vm;                                                  \
2306         TYPE *d = vd;                                                   \
2307         uint16_t mask = mve_element_mask(env);                          \
2308         unsigned le;                                                    \
2309         mask >>= ESIZE * TOP;                                           \
2310         for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) {         \
2311             mergemask(&d[H##ESIZE(le * 2 + TOP)],                       \
2312                       m[H##LESIZE(le)], mask);                          \
2313         }                                                               \
2314         mve_advance_vpt(env);                                           \
2315     }
2316 
2317 DO_VMOVN(vmovnbb, false, 1, uint8_t, 2, uint16_t)
2318 DO_VMOVN(vmovnbh, false, 2, uint16_t, 4, uint32_t)
2319 DO_VMOVN(vmovntb, true, 1, uint8_t, 2, uint16_t)
2320 DO_VMOVN(vmovnth, true, 2, uint16_t, 4, uint32_t)
2321 
2322 #define DO_VMOVN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN)           \
2323     void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm)         \
2324     {                                                                   \
2325         LTYPE *m = vm;                                                  \
2326         TYPE *d = vd;                                                   \
2327         uint16_t mask = mve_element_mask(env);                          \
2328         bool qc = false;                                                \
2329         unsigned le;                                                    \
2330         mask >>= ESIZE * TOP;                                           \
2331         for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) {         \
2332             bool sat = false;                                           \
2333             TYPE r = FN(m[H##LESIZE(le)], &sat);                        \
2334             mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask);             \
2335             qc |= sat & mask & 1;                                       \
2336         }                                                               \
2337         if (qc) {                                                       \
2338             env->vfp.qc[0] = qc;                                        \
2339         }                                                               \
2340         mve_advance_vpt(env);                                           \
2341     }
2342 
2343 #define DO_VMOVN_SAT_UB(BOP, TOP, FN)                           \
2344     DO_VMOVN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN)       \
2345     DO_VMOVN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN)
2346 
2347 #define DO_VMOVN_SAT_UH(BOP, TOP, FN)                           \
2348     DO_VMOVN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN)      \
2349     DO_VMOVN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN)
2350 
2351 #define DO_VMOVN_SAT_SB(BOP, TOP, FN)                           \
2352     DO_VMOVN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN)         \
2353     DO_VMOVN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN)
2354 
2355 #define DO_VMOVN_SAT_SH(BOP, TOP, FN)                           \
2356     DO_VMOVN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN)        \
2357     DO_VMOVN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN)
2358 
2359 #define DO_VQMOVN_SB(N, SATP)                           \
2360     do_sat_bhs((int64_t)(N), INT8_MIN, INT8_MAX, SATP)
2361 #define DO_VQMOVN_UB(N, SATP)                           \
2362     do_sat_bhs((uint64_t)(N), 0, UINT8_MAX, SATP)
2363 #define DO_VQMOVUN_B(N, SATP)                           \
2364     do_sat_bhs((int64_t)(N), 0, UINT8_MAX, SATP)
2365 
2366 #define DO_VQMOVN_SH(N, SATP)                           \
2367     do_sat_bhs((int64_t)(N), INT16_MIN, INT16_MAX, SATP)
2368 #define DO_VQMOVN_UH(N, SATP)                           \
2369     do_sat_bhs((uint64_t)(N), 0, UINT16_MAX, SATP)
2370 #define DO_VQMOVUN_H(N, SATP)                           \
2371     do_sat_bhs((int64_t)(N), 0, UINT16_MAX, SATP)
2372 
2373 DO_VMOVN_SAT_SB(vqmovnbsb, vqmovntsb, DO_VQMOVN_SB)
2374 DO_VMOVN_SAT_SH(vqmovnbsh, vqmovntsh, DO_VQMOVN_SH)
2375 DO_VMOVN_SAT_UB(vqmovnbub, vqmovntub, DO_VQMOVN_UB)
2376 DO_VMOVN_SAT_UH(vqmovnbuh, vqmovntuh, DO_VQMOVN_UH)
2377 DO_VMOVN_SAT_SB(vqmovunbb, vqmovuntb, DO_VQMOVUN_B)
2378 DO_VMOVN_SAT_SH(vqmovunbh, vqmovunth, DO_VQMOVUN_H)
2379 
2380 uint32_t HELPER(mve_vshlc)(CPUARMState *env, void *vd, uint32_t rdm,
2381                            uint32_t shift)
2382 {
2383     uint32_t *d = vd;
2384     uint16_t mask = mve_element_mask(env);
2385     unsigned e;
2386     uint32_t r;
2387 
2388     /*
2389      * For each 32-bit element, we shift it left, bringing in the
2390      * low 'shift' bits of rdm at the bottom. Bits shifted out at
2391      * the top become the new rdm, if the predicate mask permits.
2392      * The final rdm value is returned to update the register.
2393      * shift == 0 here means "shift by 32 bits".
2394      */
2395     if (shift == 0) {
2396         for (e = 0; e < 16 / 4; e++, mask >>= 4) {
2397             r = rdm;
2398             if (mask & 1) {
2399                 rdm = d[H4(e)];
2400             }
2401             mergemask(&d[H4(e)], r, mask);
2402         }
2403     } else {
2404         uint32_t shiftmask = MAKE_64BIT_MASK(0, shift);
2405 
2406         for (e = 0; e < 16 / 4; e++, mask >>= 4) {
2407             r = (d[H4(e)] << shift) | (rdm & shiftmask);
2408             if (mask & 1) {
2409                 rdm = d[H4(e)] >> (32 - shift);
2410             }
2411             mergemask(&d[H4(e)], r, mask);
2412         }
2413     }
2414     mve_advance_vpt(env);
2415     return rdm;
2416 }
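
/*
 * Illustrative walk-through (assuming all lanes are active): with
 * shift = 8, d[0] = 0x11223344 and an incoming rdm of 0xAB, lane 0
 * becomes (0x11223344 << 8) | 0xAB = 0x223344AB and the byte shifted
 * out, 0x11, is the rdm fed into lane 1; repeated across all four lanes
 * this shifts the whole 128-bit vector left by 8 bits through rdm.
 */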
2417 
2418 uint64_t HELPER(mve_sshrl)(CPUARMState *env, uint64_t n, uint32_t shift)
2419 {
2420     return do_sqrshl_d(n, -(int8_t)shift, false, NULL);
2421 }
2422 
2423 uint64_t HELPER(mve_ushll)(CPUARMState *env, uint64_t n, uint32_t shift)
2424 {
2425     return do_uqrshl_d(n, (int8_t)shift, false, NULL);
2426 }
2427 
2428 uint64_t HELPER(mve_sqshll)(CPUARMState *env, uint64_t n, uint32_t shift)
2429 {
2430     return do_sqrshl_d(n, (int8_t)shift, false, &env->QF);
2431 }
2432 
2433 uint64_t HELPER(mve_uqshll)(CPUARMState *env, uint64_t n, uint32_t shift)
2434 {
2435     return do_uqrshl_d(n, (int8_t)shift, false, &env->QF);
2436 }
2437 
2438 uint64_t HELPER(mve_sqrshrl)(CPUARMState *env, uint64_t n, uint32_t shift)
2439 {
2440     return do_sqrshl_d(n, -(int8_t)shift, true, &env->QF);
2441 }
2442 
2443 uint64_t HELPER(mve_uqrshll)(CPUARMState *env, uint64_t n, uint32_t shift)
2444 {
2445     return do_uqrshl_d(n, (int8_t)shift, true, &env->QF);
2446 }
2447 
2448 /* Operate on 64-bit values, but saturate at 48 bits */
2449 static inline int64_t do_sqrshl48_d(int64_t src, int64_t shift,
2450                                     bool round, uint32_t *sat)
2451 {
2452     int64_t val, extval;
2453 
2454     if (shift <= -48) {
2455         /* Rounding the sign bit always produces 0. */
2456         if (round) {
2457             return 0;
2458         }
2459         return src >> 63;
2460     } else if (shift < 0) {
2461         if (round) {
2462             src >>= -shift - 1;
2463             val = (src >> 1) + (src & 1);
2464         } else {
2465             val = src >> -shift;
2466         }
2467         extval = sextract64(val, 0, 48);
2468         if (!sat || val == extval) {
2469             return extval;
2470         }
2471     } else if (shift < 48) {
2472         extval = sextract64(src << shift, 0, 48);
2473         if (!sat || src == (extval >> shift)) {
2474             return extval;
2475         }
2476     } else if (!sat || src == 0) {
2477         return 0;
2478     }
2479 
2480     *sat = 1;
2481     return src >= 0 ? MAKE_64BIT_MASK(0, 47) : MAKE_64BIT_MASK(47, 17);
2482 }
2483 
2484 /* Operate on 64-bit values, but saturate at 48 bits */
2485 static inline uint64_t do_uqrshl48_d(uint64_t src, int64_t shift,
2486                                      bool round, uint32_t *sat)
2487 {
2488     uint64_t val, extval;
2489 
2490     if (shift <= -(48 + round)) {
2491         return 0;
2492     } else if (shift < 0) {
2493         if (round) {
2494             val = src >> (-shift - 1);
2495             val = (val >> 1) + (val & 1);
2496         } else {
2497             val = src >> -shift;
2498         }
2499         extval = extract64(val, 0, 48);
2500         if (!sat || val == extval) {
2501             return extval;
2502         }
2503     } else if (shift < 48) {
2504         extval = extract64(src << shift, 0, 48);
2505         if (!sat || src == (extval >> shift)) {
2506             return extval;
2507         }
2508     } else if (!sat || src == 0) {
2509         return 0;
2510     }
2511 
2512     *sat = 1;
2513     return MAKE_64BIT_MASK(0, 48);
2514 }
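
/*
 * The 48-bit saturation limits used above expand as follows:
 * MAKE_64BIT_MASK(0, 47) is 0x00007fffffffffff, the largest positive
 * 48-bit signed value, and MAKE_64BIT_MASK(47, 17) is 0xffff800000000000,
 * the most negative 48-bit value sign-extended to 64 bits; the unsigned
 * variant saturates to MAKE_64BIT_MASK(0, 48) = 0x0000ffffffffffff.
 */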
2515 
2516 uint64_t HELPER(mve_sqrshrl48)(CPUARMState *env, uint64_t n, uint32_t shift)
2517 {
2518     return do_sqrshl48_d(n, -(int8_t)shift, true, &env->QF);
2519 }
2520 
2521 uint64_t HELPER(mve_uqrshll48)(CPUARMState *env, uint64_t n, uint32_t shift)
2522 {
2523     return do_uqrshl48_d(n, (int8_t)shift, true, &env->QF);
2524 }
2525 
2526 uint32_t HELPER(mve_uqshl)(CPUARMState *env, uint32_t n, uint32_t shift)
2527 {
2528     return do_uqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
2529 }
2530 
2531 uint32_t HELPER(mve_sqshl)(CPUARMState *env, uint32_t n, uint32_t shift)
2532 {
2533     return do_sqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
2534 }
2535 
2536 uint32_t HELPER(mve_uqrshl)(CPUARMState *env, uint32_t n, uint32_t shift)
2537 {
2538     return do_uqrshl_bhs(n, (int8_t)shift, 32, true, &env->QF);
2539 }
2540 
2541 uint32_t HELPER(mve_sqrshr)(CPUARMState *env, uint32_t n, uint32_t shift)
2542 {
2543     return do_sqrshl_bhs(n, -(int8_t)shift, 32, true, &env->QF);
2544 }
2545 
2546 #define DO_VIDUP(OP, ESIZE, TYPE, FN)                           \
2547     uint32_t HELPER(mve_##OP)(CPUARMState *env, void *vd,       \
2548                            uint32_t offset, uint32_t imm)       \
2549     {                                                           \
2550         TYPE *d = vd;                                           \
2551         uint16_t mask = mve_element_mask(env);                  \
2552         unsigned e;                                             \
2553         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {      \
2554             mergemask(&d[H##ESIZE(e)], offset, mask);           \
2555             offset = FN(offset, imm);                           \
2556         }                                                       \
2557         mve_advance_vpt(env);                                   \
2558         return offset;                                          \
2559     }
2560 
2561 #define DO_VIWDUP(OP, ESIZE, TYPE, FN)                          \
2562     uint32_t HELPER(mve_##OP)(CPUARMState *env, void *vd,       \
2563                               uint32_t offset, uint32_t wrap,   \
2564                               uint32_t imm)                     \
2565     {                                                           \
2566         TYPE *d = vd;                                           \
2567         uint16_t mask = mve_element_mask(env);                  \
2568         unsigned e;                                             \
2569         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {      \
2570             mergemask(&d[H##ESIZE(e)], offset, mask);           \
2571             offset = FN(offset, wrap, imm);                     \
2572         }                                                       \
2573         mve_advance_vpt(env);                                   \
2574         return offset;                                          \
2575     }
2576 
2577 #define DO_VIDUP_ALL(OP, FN)                    \
2578     DO_VIDUP(OP##b, 1, int8_t, FN)              \
2579     DO_VIDUP(OP##h, 2, int16_t, FN)             \
2580     DO_VIDUP(OP##w, 4, int32_t, FN)
2581 
2582 #define DO_VIWDUP_ALL(OP, FN)                   \
2583     DO_VIWDUP(OP##b, 1, int8_t, FN)             \
2584     DO_VIWDUP(OP##h, 2, int16_t, FN)            \
2585     DO_VIWDUP(OP##w, 4, int32_t, FN)
2586 
2587 static uint32_t do_add_wrap(uint32_t offset, uint32_t wrap, uint32_t imm)
2588 {
2589     offset += imm;
2590     if (offset == wrap) {
2591         offset = 0;
2592     }
2593     return offset;
2594 }
2595 
2596 static uint32_t do_sub_wrap(uint32_t offset, uint32_t wrap, uint32_t imm)
2597 {
2598     if (offset == 0) {
2599         offset = wrap;
2600     }
2601     offset -= imm;
2602     return offset;
2603 }
2604 
2605 DO_VIDUP_ALL(vidup, DO_ADD)
2606 DO_VIWDUP_ALL(viwdup, do_add_wrap)
2607 DO_VIWDUP_ALL(vdwdup, do_sub_wrap)
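
/*
 * Wrap behaviour, with illustrative values: viwdupb with imm = 2,
 * wrap = 8 and a starting offset of 4 writes 4, 6 to the first two
 * elements, then the offset hits 8 == wrap and resets to 0, so the
 * following elements get 0, 2, ...; vdwdupb counts down instead,
 * reloading the offset from 'wrap' once it reaches 0.
 */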
2608 
2609 /*
2610  * Vector comparison.
2611  * P0 bits for non-executed beats (where eci_mask is 0) are unchanged.
2612  * P0 bits for predicated lanes in executed beats (where mask is 0) are 0.
2613  * P0 bits otherwise are updated with the results of the comparisons.
2614  * We must also keep unchanged the MASK fields at the top of v7m.vpr.
2615  */
2616 #define DO_VCMP(OP, ESIZE, TYPE, FN)                                    \
2617     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, void *vm)   \
2618     {                                                                   \
2619         TYPE *n = vn, *m = vm;                                          \
2620         uint16_t mask = mve_element_mask(env);                          \
2621         uint16_t eci_mask = mve_eci_mask(env);                          \
2622         uint16_t beatpred = 0;                                          \
2623         uint16_t emask = MAKE_64BIT_MASK(0, ESIZE);                     \
2624         unsigned e;                                                     \
2625         for (e = 0; e < 16 / ESIZE; e++) {                              \
2626             bool r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)]);                \
2627             /* Comparison sets 0/1 bits for each byte in the element */ \
2628             beatpred |= r * emask;                                      \
2629             emask <<= ESIZE;                                            \
2630         }                                                               \
2631         beatpred &= mask;                                               \
2632         env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) |           \
2633             (beatpred & eci_mask);                                      \
2634         mve_advance_vpt(env);                                           \
2635     }
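
/*
 * To make the masking concrete, suppose a byte-wise compare where beats
 * 0 and 1 were already executed (eci_mask = 0xff00), only bytes 12 and
 * 13 are unpredicated (mask = 0x3000) and the comparison is true for
 * every element: beatpred is 0xffff & mask = 0x3000, so P0 bits 0..7
 * keep their old values, bits 12 and 13 are set, and bits 8..11 and
 * 14..15 are cleared.
 */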
2636 
2637 #define DO_VCMP_SCALAR(OP, ESIZE, TYPE, FN)                             \
2638     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn,             \
2639                                 uint32_t rm)                            \
2640     {                                                                   \
2641         TYPE *n = vn;                                                   \
2642         uint16_t mask = mve_element_mask(env);                          \
2643         uint16_t eci_mask = mve_eci_mask(env);                          \
2644         uint16_t beatpred = 0;                                          \
2645         uint16_t emask = MAKE_64BIT_MASK(0, ESIZE);                     \
2646         unsigned e;                                                     \
2647         for (e = 0; e < 16 / ESIZE; e++) {                              \
2648             bool r = FN(n[H##ESIZE(e)], (TYPE)rm);                      \
2649             /* Comparison sets 0/1 bits for each byte in the element */ \
2650             beatpred |= r * emask;                                      \
2651             emask <<= ESIZE;                                            \
2652         }                                                               \
2653         beatpred &= mask;                                               \
2654         env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) |           \
2655             (beatpred & eci_mask);                                      \
2656         mve_advance_vpt(env);                                           \
2657     }
2658 
2659 #define DO_VCMP_S(OP, FN)                               \
2660     DO_VCMP(OP##b, 1, int8_t, FN)                       \
2661     DO_VCMP(OP##h, 2, int16_t, FN)                      \
2662     DO_VCMP(OP##w, 4, int32_t, FN)                      \
2663     DO_VCMP_SCALAR(OP##_scalarb, 1, int8_t, FN)         \
2664     DO_VCMP_SCALAR(OP##_scalarh, 2, int16_t, FN)        \
2665     DO_VCMP_SCALAR(OP##_scalarw, 4, int32_t, FN)
2666 
2667 #define DO_VCMP_U(OP, FN)                               \
2668     DO_VCMP(OP##b, 1, uint8_t, FN)                      \
2669     DO_VCMP(OP##h, 2, uint16_t, FN)                     \
2670     DO_VCMP(OP##w, 4, uint32_t, FN)                     \
2671     DO_VCMP_SCALAR(OP##_scalarb, 1, uint8_t, FN)        \
2672     DO_VCMP_SCALAR(OP##_scalarh, 2, uint16_t, FN)       \
2673     DO_VCMP_SCALAR(OP##_scalarw, 4, uint32_t, FN)
2674 
2675 #define DO_EQ(N, M) ((N) == (M))
2676 #define DO_NE(N, M) ((N) != (M))
2679 #define DO_GE(N, M) ((N) >= (M))
2680 #define DO_LT(N, M) ((N) < (M))
2681 #define DO_GT(N, M) ((N) > (M))
2682 #define DO_LE(N, M) ((N) <= (M))
2683 
2684 DO_VCMP_U(vcmpeq, DO_EQ)
2685 DO_VCMP_U(vcmpne, DO_NE)
2686 DO_VCMP_U(vcmpcs, DO_GE)
2687 DO_VCMP_U(vcmphi, DO_GT)
2688 DO_VCMP_S(vcmpge, DO_GE)
2689 DO_VCMP_S(vcmplt, DO_LT)
2690 DO_VCMP_S(vcmpgt, DO_GT)
2691 DO_VCMP_S(vcmple, DO_LE)
2692 
2693 void HELPER(mve_vpsel)(CPUARMState *env, void *vd, void *vn, void *vm)
2694 {
2695     /*
2696      * Qd[n] = VPR.P0[n] ? Qn[n] : Qm[n]
2697      * but note that whether bytes are written to Qd is still subject
2698      * to (all forms of) predication in the usual way.
2699      */
2700     uint64_t *d = vd, *n = vn, *m = vm;
2701     uint16_t mask = mve_element_mask(env);
2702     uint16_t p0 = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0);
2703     unsigned e;
2704     for (e = 0; e < 16 / 8; e++, mask >>= 8, p0 >>= 8) {
2705         uint64_t r = m[H8(e)];
2706         mergemask(&r, n[H8(e)], p0);
2707         mergemask(&d[H8(e)], r, mask);
2708     }
2709     mve_advance_vpt(env);
2710 }
2711 
2712 void HELPER(mve_vpnot)(CPUARMState *env)
2713 {
2714     /*
2715      * P0 bits for unexecuted beats (where eci_mask is 0) are unchanged.
2716      * P0 bits for predicated lanes in executed beats (where mask is 0) are 0.
2717      * P0 bits otherwise are inverted.
2718      * (This is the same logic as VCMP.)
2719      * This insn is itself subject to predication and to beat-wise execution,
2720      * and after it executes VPT state advances in the usual way.
2721      */
2722     uint16_t mask = mve_element_mask(env);
2723     uint16_t eci_mask = mve_eci_mask(env);
2724     uint16_t beatpred = ~env->v7m.vpr & mask;
2725     env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (beatpred & eci_mask);
2726     mve_advance_vpt(env);
2727 }
2728 
2729 /*
2730  * VCTP: P0 unexecuted bits unchanged, predicated bits zeroed,
2731  * otherwise set according to value of Rn. The calculation of
2732  * newmask here works in the same way as the calculation of the
2733  * ltpmask in mve_element_mask(), but we have pre-calculated
2734  * the masklen in the generated code.
2735  */
2736 void HELPER(mve_vctp)(CPUARMState *env, uint32_t masklen)
2737 {
2738     uint16_t mask = mve_element_mask(env);
2739     uint16_t eci_mask = mve_eci_mask(env);
2740     uint16_t newmask;
2741 
2742     assert(masklen <= 16);
2743     newmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0;
2744     newmask &= mask;
2745     env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (newmask & eci_mask);
2746     mve_advance_vpt(env);
2747 }
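
/*
 * For example, for a VCTP.16 the pre-calculated masklen is Rn scaled by
 * the element size and capped at 16, so Rn = 3 gives masklen = 6 and
 * newmask = MAKE_64BIT_MASK(0, 6) = 0x003f: the first three 16-bit
 * elements stay enabled and the remaining P0 bits are cleared, subject
 * as usual to the existing predication and ECI state.
 */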
2748 
2749 #define DO_1OP_SAT(OP, ESIZE, TYPE, FN)                                 \
2750     void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm)         \
2751     {                                                                   \
2752         TYPE *d = vd, *m = vm;                                          \
2753         uint16_t mask = mve_element_mask(env);                          \
2754         unsigned e;                                                     \
2755         bool qc = false;                                                \
2756         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
2757             bool sat = false;                                           \
2758             mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)], &sat), mask); \
2759             qc |= sat & mask & 1;                                       \
2760         }                                                               \
2761         if (qc) {                                                       \
2762             env->vfp.qc[0] = qc;                                        \
2763         }                                                               \
2764         mve_advance_vpt(env);                                           \
2765     }
2766 
2767 #define DO_VQABS_B(N, SATP) \
2768     do_sat_bhs(DO_ABS((int64_t)N), INT8_MIN, INT8_MAX, SATP)
2769 #define DO_VQABS_H(N, SATP) \
2770     do_sat_bhs(DO_ABS((int64_t)N), INT16_MIN, INT16_MAX, SATP)
2771 #define DO_VQABS_W(N, SATP) \
2772     do_sat_bhs(DO_ABS((int64_t)N), INT32_MIN, INT32_MAX, SATP)
2773 
2774 #define DO_VQNEG_B(N, SATP) do_sat_bhs(-(int64_t)N, INT8_MIN, INT8_MAX, SATP)
2775 #define DO_VQNEG_H(N, SATP) do_sat_bhs(-(int64_t)N, INT16_MIN, INT16_MAX, SATP)
2776 #define DO_VQNEG_W(N, SATP) do_sat_bhs(-(int64_t)N, INT32_MIN, INT32_MAX, SATP)
2777 
2778 DO_1OP_SAT(vqabsb, 1, int8_t, DO_VQABS_B)
2779 DO_1OP_SAT(vqabsh, 2, int16_t, DO_VQABS_H)
2780 DO_1OP_SAT(vqabsw, 4, int32_t, DO_VQABS_W)
2781 
2782 DO_1OP_SAT(vqnegb, 1, int8_t, DO_VQNEG_B)
2783 DO_1OP_SAT(vqnegh, 2, int16_t, DO_VQNEG_H)
2784 DO_1OP_SAT(vqnegw, 4, int32_t, DO_VQNEG_W)
2785 
2786 /*
2787  * VMAXA, VMINA: vd is unsigned; vm is signed, and we take its
2788  * absolute value; we then do an unsigned comparison.
2789  */
2790 #define DO_VMAXMINA(OP, ESIZE, STYPE, UTYPE, FN)                        \
2791     void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm)         \
2792     {                                                                   \
2793         UTYPE *d = vd;                                                  \
2794         STYPE *m = vm;                                                  \
2795         uint16_t mask = mve_element_mask(env);                          \
2796         unsigned e;                                                     \
2797         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
2798             UTYPE r = DO_ABS(m[H##ESIZE(e)]);                           \
2799             r = FN(d[H##ESIZE(e)], r);                                  \
2800             mergemask(&d[H##ESIZE(e)], r, mask);                        \
2801         }                                                               \
2802         mve_advance_vpt(env);                                           \
2803     }
2804 
2805 DO_VMAXMINA(vmaxab, 1, int8_t, uint8_t, DO_MAX)
2806 DO_VMAXMINA(vmaxah, 2, int16_t, uint16_t, DO_MAX)
2807 DO_VMAXMINA(vmaxaw, 4, int32_t, uint32_t, DO_MAX)
2808 DO_VMAXMINA(vminab, 1, int8_t, uint8_t, DO_MIN)
2809 DO_VMAXMINA(vminah, 2, int16_t, uint16_t, DO_MIN)
2810 DO_VMAXMINA(vminaw, 4, int32_t, uint32_t, DO_MIN)
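
/*
 * For example (byte lanes, illustrative values): with d[0] = 200 and
 * m[0] = -100, the absolute value 100 is compared unsigned against 200,
 * so vmaxab leaves d[0] at 200 while vminab would store 100.
 */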
2811 
2812 /*
2813  * 2-operand floating point. Note that if an element is partially
2814  * predicated we must do the FP operation to update the non-predicated
2815  * bytes, but we must be careful to avoid updating the FP exception
2816  * state unless byte 0 of the element was unpredicated.
2817  */
2818 #define DO_2OP_FP(OP, ESIZE, TYPE, FN)                                  \
2819     void HELPER(glue(mve_, OP))(CPUARMState *env,                       \
2820                                 void *vd, void *vn, void *vm)           \
2821     {                                                                   \
2822         TYPE *d = vd, *n = vn, *m = vm;                                 \
2823         TYPE r;                                                         \
2824         uint16_t mask = mve_element_mask(env);                          \
2825         unsigned e;                                                     \
2826         float_status *fpst;                                             \
2827         float_status scratch_fpst;                                      \
2828         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
2829             if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) {              \
2830                 continue;                                               \
2831             }                                                           \
2832             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
2833             if (!(mask & 1)) {                                          \
2834                 /* We need the result but without updating flags */     \
2835                 scratch_fpst = *fpst;                                   \
2836                 fpst = &scratch_fpst;                                   \
2837             }                                                           \
2838             r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], fpst);               \
2839             mergemask(&d[H##ESIZE(e)], r, mask);                        \
2840         }                                                               \
2841         mve_advance_vpt(env);                                           \
2842     }
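
/*
 * Concretely: for a 32-bit lane whose four predicate bits are 0b0110
 * (bytes 1 and 2 active, byte 0 not), the operation still runs so the
 * active bytes of the result can be merged in, but it runs on a copy of
 * the float_status, so any exception flags it raises are discarded;
 * only lanes with byte 0 active (mask & 1) update the real FP status.
 */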
2843 
2844 #define DO_2OP_FP_ALL(OP, FN)                  \
2845     DO_2OP_FP(OP##h, 2, float16, float16_##FN) \
2846     DO_2OP_FP(OP##s, 4, float32, float32_##FN)
2847 
2848 DO_2OP_FP_ALL(vfadd, add)
2849 DO_2OP_FP_ALL(vfsub, sub)
2850 DO_2OP_FP_ALL(vfmul, mul)
2851 
2852 static inline float16 float16_abd(float16 a, float16 b, float_status *s)
2853 {
2854     return float16_abs(float16_sub(a, b, s));
2855 }
2856 
2857 static inline float32 float32_abd(float32 a, float32 b, float_status *s)
2858 {
2859     return float32_abs(float32_sub(a, b, s));
2860 }
2861 
2862 DO_2OP_FP_ALL(vfabd, abd)
2863 DO_2OP_FP_ALL(vmaxnm, maxnum)
2864 DO_2OP_FP_ALL(vminnm, minnum)
2865 
2866 static inline float16 float16_maxnuma(float16 a, float16 b, float_status *s)
2867 {
2868     return float16_maxnum(float16_abs(a), float16_abs(b), s);
2869 }
2870 
2871 static inline float32 float32_maxnuma(float32 a, float32 b, float_status *s)
2872 {
2873     return float32_maxnum(float32_abs(a), float32_abs(b), s);
2874 }
2875 
2876 static inline float16 float16_minnuma(float16 a, float16 b, float_status *s)
2877 {
2878     return float16_minnum(float16_abs(a), float16_abs(b), s);
2879 }
2880 
2881 static inline float32 float32_minnuma(float32 a, float32 b, float_status *s)
2882 {
2883     return float32_minnum(float32_abs(a), float32_abs(b), s);
2884 }
2885 
2886 DO_2OP_FP_ALL(vmaxnma, maxnuma)
2887 DO_2OP_FP_ALL(vminnma, minnuma)
2888 
2889 #define DO_VCADD_FP(OP, ESIZE, TYPE, FN0, FN1)                          \
2890     void HELPER(glue(mve_, OP))(CPUARMState *env,                       \
2891                                 void *vd, void *vn, void *vm)           \
2892     {                                                                   \
2893         TYPE *d = vd, *n = vn, *m = vm;                                 \
2894         TYPE r[16 / ESIZE];                                             \
2895         uint16_t tm, mask = mve_element_mask(env);                      \
2896         unsigned e;                                                     \
2897         float_status *fpst;                                             \
2898         float_status scratch_fpst;                                      \
2899         /* Calculate all results first to avoid overwriting inputs */   \
2900         for (e = 0, tm = mask; e < 16 / ESIZE; e++, tm >>= ESIZE) {     \
2901             if ((tm & MAKE_64BIT_MASK(0, ESIZE)) == 0) {                \
2902                 r[e] = 0;                                               \
2903                 continue;                                               \
2904             }                                                           \
2905             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
2906             if (!(tm & 1)) {                                            \
2907                 /* We need the result but without updating flags */     \
2908                 scratch_fpst = *fpst;                                   \
2909                 fpst = &scratch_fpst;                                   \
2910             }                                                           \
2911             if (!(e & 1)) {                                             \
2912                 r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)], fpst);   \
2913             } else {                                                    \
2914                 r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)], fpst);   \
2915             }                                                           \
2916         }                                                               \
2917         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
2918             mergemask(&d[H##ESIZE(e)], r[e], mask);                     \
2919         }                                                               \
2920         mve_advance_vpt(env);                                           \
2921     }
2922 
2923 DO_VCADD_FP(vfcadd90h, 2, float16, float16_sub, float16_add)
2924 DO_VCADD_FP(vfcadd90s, 4, float32, float32_sub, float32_add)
2925 DO_VCADD_FP(vfcadd270h, 2, float16, float16_add, float16_sub)
2926 DO_VCADD_FP(vfcadd270s, 4, float32, float32_add, float32_sub)
2927 
2928 #define DO_VFMA(OP, ESIZE, TYPE, CHS)                                   \
2929     void HELPER(glue(mve_, OP))(CPUARMState *env,                       \
2930                                 void *vd, void *vn, void *vm)           \
2931     {                                                                   \
2932         TYPE *d = vd, *n = vn, *m = vm;                                 \
2933         TYPE r;                                                         \
2934         uint16_t mask = mve_element_mask(env);                          \
2935         unsigned e;                                                     \
2936         float_status *fpst;                                             \
2937         float_status scratch_fpst;                                      \
2938         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
2939             if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) {              \
2940                 continue;                                               \
2941             }                                                           \
2942             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
2943             if (!(mask & 1)) {                                          \
2944                 /* We need the result but without updating flags */     \
2945                 scratch_fpst = *fpst;                                   \
2946                 fpst = &scratch_fpst;                                   \
2947             }                                                           \
2948             r = n[H##ESIZE(e)];                                         \
2949             if (CHS) {                                                  \
2950                 r = TYPE##_chs(r);                                      \
2951             }                                                           \
2952             r = TYPE##_muladd(r, m[H##ESIZE(e)], d[H##ESIZE(e)],        \
2953                               0, fpst);                                 \
2954             mergemask(&d[H##ESIZE(e)], r, mask);                        \
2955         }                                                               \
2956         mve_advance_vpt(env);                                           \
2957     }
2958 
2959 DO_VFMA(vfmah, 2, float16, false)
2960 DO_VFMA(vfmas, 4, float32, false)
2961 DO_VFMA(vfmsh, 2, float16, true)
2962 DO_VFMA(vfmss, 4, float32, true)
2963 
2964 #define DO_VCMLA(OP, ESIZE, TYPE, ROT, FN)                              \
2965     void HELPER(glue(mve_, OP))(CPUARMState *env,                       \
2966                                 void *vd, void *vn, void *vm)           \
2967     {                                                                   \
2968         TYPE *d = vd, *n = vn, *m = vm;                                 \
2969         TYPE r0, r1, e1, e2, e3, e4;                                    \
2970         uint16_t mask = mve_element_mask(env);                          \
2971         unsigned e;                                                     \
2972         float_status *fpst0, *fpst1;                                    \
2973         float_status scratch_fpst;                                      \
2974         /* We loop through pairs of elements at a time */               \
2975         for (e = 0; e < 16 / ESIZE; e += 2, mask >>= ESIZE * 2) {       \
2976             if ((mask & MAKE_64BIT_MASK(0, ESIZE * 2)) == 0) {          \
2977                 continue;                                               \
2978             }                                                           \
2979             fpst0 = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
2980             fpst1 = fpst0;                                              \
2981             if (!(mask & 1)) {                                          \
2982                 scratch_fpst = *fpst0;                                  \
2983                 fpst0 = &scratch_fpst;                                  \
2984             }                                                           \
2985             if (!(mask & (1 << ESIZE))) {                               \
2986                 scratch_fpst = *fpst1;                                  \
2987                 fpst1 = &scratch_fpst;                                  \
2988             }                                                           \
2989             switch (ROT) {                                              \
2990             case 0:                                                     \
2991                 e1 = m[H##ESIZE(e)];                                    \
2992                 e2 = n[H##ESIZE(e)];                                    \
2993                 e3 = m[H##ESIZE(e + 1)];                                \
2994                 e4 = n[H##ESIZE(e)];                                    \
2995                 break;                                                  \
2996             case 1:                                                     \
2997                 e1 = TYPE##_chs(m[H##ESIZE(e + 1)]);                    \
2998                 e2 = n[H##ESIZE(e + 1)];                                \
2999                 e3 = m[H##ESIZE(e)];                                    \
3000                 e4 = n[H##ESIZE(e + 1)];                                \
3001                 break;                                                  \
3002             case 2:                                                     \
3003                 e1 = TYPE##_chs(m[H##ESIZE(e)]);                        \
3004                 e2 = n[H##ESIZE(e)];                                    \
3005                 e3 = TYPE##_chs(m[H##ESIZE(e + 1)]);                    \
3006                 e4 = n[H##ESIZE(e)];                                    \
3007                 break;                                                  \
3008             case 3:                                                     \
3009                 e1 = m[H##ESIZE(e + 1)];                                \
3010                 e2 = n[H##ESIZE(e + 1)];                                \
3011                 e3 = TYPE##_chs(m[H##ESIZE(e)]);                        \
3012                 e4 = n[H##ESIZE(e + 1)];                                \
3013                 break;                                                  \
3014             default:                                                    \
3015                 g_assert_not_reached();                                 \
3016             }                                                           \
3017             r0 = FN(e2, e1, d[H##ESIZE(e)], fpst0);                     \
3018             r1 = FN(e4, e3, d[H##ESIZE(e + 1)], fpst1);                 \
3019             mergemask(&d[H##ESIZE(e)], r0, mask);                       \
3020             mergemask(&d[H##ESIZE(e + 1)], r1, mask >> ESIZE);          \
3021         }                                                               \
3022         mve_advance_vpt(env);                                           \
3023     }
3024 
3025 #define DO_VCMULH(N, M, D, S) float16_mul(N, M, S)
3026 #define DO_VCMULS(N, M, D, S) float32_mul(N, M, S)
3027 
3028 #define DO_VCMLAH(N, M, D, S) float16_muladd(N, M, D, 0, S)
3029 #define DO_VCMLAS(N, M, D, S) float32_muladd(N, M, D, 0, S)
3030 
3031 DO_VCMLA(vcmul0h, 2, float16, 0, DO_VCMULH)
3032 DO_VCMLA(vcmul0s, 4, float32, 0, DO_VCMULS)
3033 DO_VCMLA(vcmul90h, 2, float16, 1, DO_VCMULH)
3034 DO_VCMLA(vcmul90s, 4, float32, 1, DO_VCMULS)
3035 DO_VCMLA(vcmul180h, 2, float16, 2, DO_VCMULH)
3036 DO_VCMLA(vcmul180s, 4, float32, 2, DO_VCMULS)
3037 DO_VCMLA(vcmul270h, 2, float16, 3, DO_VCMULH)
3038 DO_VCMLA(vcmul270s, 4, float32, 3, DO_VCMULS)
3039 
3040 DO_VCMLA(vcmla0h, 2, float16, 0, DO_VCMLAH)
3041 DO_VCMLA(vcmla0s, 4, float32, 0, DO_VCMLAS)
3042 DO_VCMLA(vcmla90h, 2, float16, 1, DO_VCMLAH)
3043 DO_VCMLA(vcmla90s, 4, float32, 1, DO_VCMLAS)
3044 DO_VCMLA(vcmla180h, 2, float16, 2, DO_VCMLAH)
3045 DO_VCMLA(vcmla180s, 4, float32, 2, DO_VCMLAS)
3046 DO_VCMLA(vcmla270h, 2, float16, 3, DO_VCMLAH)
3047 DO_VCMLA(vcmla270s, 4, float32, 3, DO_VCMLAS)
3048 
3049 #define DO_2OP_FP_SCALAR(OP, ESIZE, TYPE, FN)                           \
3050     void HELPER(glue(mve_, OP))(CPUARMState *env,                       \
3051                                 void *vd, void *vn, uint32_t rm)        \
3052     {                                                                   \
3053         TYPE *d = vd, *n = vn;                                          \
3054         TYPE r, m = rm;                                                 \
3055         uint16_t mask = mve_element_mask(env);                          \
3056         unsigned e;                                                     \
3057         float_status *fpst;                                             \
3058         float_status scratch_fpst;                                      \
3059         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
3060             if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) {              \
3061                 continue;                                               \
3062             }                                                           \
3063             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
3064             if (!(mask & 1)) {                                          \
3065                 /* We need the result but without updating flags */     \
3066                 scratch_fpst = *fpst;                                   \
3067                 fpst = &scratch_fpst;                                   \
3068             }                                                           \
3069             r = FN(n[H##ESIZE(e)], m, fpst);                            \
3070             mergemask(&d[H##ESIZE(e)], r, mask);                        \
3071         }                                                               \
3072         mve_advance_vpt(env);                                           \
3073     }
3074 
3075 #define DO_2OP_FP_SCALAR_ALL(OP, FN)                    \
3076     DO_2OP_FP_SCALAR(OP##h, 2, float16, float16_##FN)   \
3077     DO_2OP_FP_SCALAR(OP##s, 4, float32, float32_##FN)
3078 
3079 DO_2OP_FP_SCALAR_ALL(vfadd_scalar, add)
3080 DO_2OP_FP_SCALAR_ALL(vfsub_scalar, sub)
3081 DO_2OP_FP_SCALAR_ALL(vfmul_scalar, mul)
3082 
3083 #define DO_2OP_FP_ACC_SCALAR(OP, ESIZE, TYPE, FN)                       \
3084     void HELPER(glue(mve_, OP))(CPUARMState *env,                       \
3085                                 void *vd, void *vn, uint32_t rm)        \
3086     {                                                                   \
3087         TYPE *d = vd, *n = vn;                                          \
3088         TYPE r, m = rm;                                                 \
3089         uint16_t mask = mve_element_mask(env);                          \
3090         unsigned e;                                                     \
3091         float_status *fpst;                                             \
3092         float_status scratch_fpst;                                      \
3093         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
3094             if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) {              \
3095                 continue;                                               \
3096             }                                                           \
3097             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
3098             if (!(mask & 1)) {                                          \
3099                 /* We need the result but without updating flags */     \
3100                 scratch_fpst = *fpst;                                   \
3101                 fpst = &scratch_fpst;                                   \
3102             }                                                           \
3103             r = FN(n[H##ESIZE(e)], m, d[H##ESIZE(e)], 0, fpst);         \
3104             mergemask(&d[H##ESIZE(e)], r, mask);                        \
3105         }                                                               \
3106         mve_advance_vpt(env);                                           \
3107     }
3108 
3109 /* VFMAS is vector * vector + scalar, so swap op2 and op3 */
3110 #define DO_VFMAS_SCALARH(N, M, D, F, S) float16_muladd(N, D, M, F, S)
3111 #define DO_VFMAS_SCALARS(N, M, D, F, S) float32_muladd(N, D, M, F, S)
3112 
3113 /* VFMA is vector * scalar + vector */
3114 DO_2OP_FP_ACC_SCALAR(vfma_scalarh, 2, float16, float16_muladd)
3115 DO_2OP_FP_ACC_SCALAR(vfma_scalars, 4, float32, float32_muladd)
3116 DO_2OP_FP_ACC_SCALAR(vfmas_scalarh, 2, float16, DO_VFMAS_SCALARH)
3117 DO_2OP_FP_ACC_SCALAR(vfmas_scalars, 4, float32, DO_VFMAS_SCALARS)
3118 
3119 /* Floating point max/min across vector. */
3120 #define DO_FP_VMAXMINV(OP, ESIZE, TYPE, ABS, FN)                \
3121     uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
3122                                     uint32_t ra_in)             \
3123     {                                                           \
3124         uint16_t mask = mve_element_mask(env);                  \
3125         unsigned e;                                             \
3126         TYPE *m = vm;                                           \
3127         TYPE ra = (TYPE)ra_in;                                  \
3128         float_status *fpst =                                    \
3129             &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
3130         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {      \
3131             if (mask & 1) {                                     \
3132                 TYPE v = m[H##ESIZE(e)];                        \
3133                 if (TYPE##_is_signaling_nan(ra, fpst)) {        \
3134                     ra = TYPE##_silence_nan(ra, fpst);          \
3135                     float_raise(float_flag_invalid, fpst);      \
3136                 }                                               \
3137                 if (TYPE##_is_signaling_nan(v, fpst)) {         \
3138                     v = TYPE##_silence_nan(v, fpst);            \
3139                     float_raise(float_flag_invalid, fpst);      \
3140                 }                                               \
3141                 if (ABS) {                                      \
3142                     v = TYPE##_abs(v);                          \
3143                 }                                               \
3144                 ra = FN(ra, v, fpst);                           \
3145             }                                                   \
3146         }                                                       \
3147         mve_advance_vpt(env);                                   \
3148         return ra;                                              \
3149     }                                                           \
3150 
3151 #define NOP(X) (X)
3152 
3153 DO_FP_VMAXMINV(vmaxnmvh, 2, float16, false, float16_maxnum)
3154 DO_FP_VMAXMINV(vmaxnmvs, 4, float32, false, float32_maxnum)
3155 DO_FP_VMAXMINV(vminnmvh, 2, float16, false, float16_minnum)
3156 DO_FP_VMAXMINV(vminnmvs, 4, float32, false, float32_minnum)
3157 DO_FP_VMAXMINV(vmaxnmavh, 2, float16, true, float16_maxnum)
3158 DO_FP_VMAXMINV(vmaxnmavs, 4, float32, true, float32_maxnum)
3159 DO_FP_VMAXMINV(vminnmavh, 2, float16, true, float16_minnum)
3160 DO_FP_VMAXMINV(vminnmavs, 4, float32, true, float32_minnum)
3161 
3162 /* FP compares; note that all comparisons signal InvalidOp for QNaNs */
3163 #define DO_VCMP_FP(OP, ESIZE, TYPE, FN)                                 \
3164     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, void *vm)   \
3165     {                                                                   \
3166         TYPE *n = vn, *m = vm;                                          \
3167         uint16_t mask = mve_element_mask(env);                          \
3168         uint16_t eci_mask = mve_eci_mask(env);                          \
3169         uint16_t beatpred = 0;                                          \
3170         uint16_t emask = MAKE_64BIT_MASK(0, ESIZE);                     \
3171         unsigned e;                                                     \
3172         float_status *fpst;                                             \
3173         float_status scratch_fpst;                                      \
3174         bool r;                                                         \
3175         for (e = 0; e < 16 / ESIZE; e++, emask <<= ESIZE) {             \
3176             if ((mask & emask) == 0) {                                  \
3177                 continue;                                               \
3178             }                                                           \
3179             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
3180             if (!(mask & (1 << (e * ESIZE)))) {                         \
3181                 /* We need the result but without updating flags */     \
3182                 scratch_fpst = *fpst;                                   \
3183                 fpst = &scratch_fpst;                                   \
3184             }                                                           \
3185             r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], fpst);               \
3186             /* Comparison sets 0/1 bits for each byte in the element */ \
3187             beatpred |= r * emask;                                      \
3188         }                                                               \
3189         beatpred &= mask;                                               \
3190         env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) |           \
3191             (beatpred & eci_mask);                                      \
3192         mve_advance_vpt(env);                                           \
3193     }
3194 
3195 #define DO_VCMP_FP_SCALAR(OP, ESIZE, TYPE, FN)                          \
3196     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn,             \
3197                                 uint32_t rm)                            \
3198     {                                                                   \
3199         TYPE *n = vn;                                                   \
3200         uint16_t mask = mve_element_mask(env);                          \
3201         uint16_t eci_mask = mve_eci_mask(env);                          \
3202         uint16_t beatpred = 0;                                          \
3203         uint16_t emask = MAKE_64BIT_MASK(0, ESIZE);                     \
3204         unsigned e;                                                     \
3205         float_status *fpst;                                             \
3206         float_status scratch_fpst;                                      \
3207         bool r;                                                         \
3208         for (e = 0; e < 16 / ESIZE; e++, emask <<= ESIZE) {             \
3209             if ((mask & emask) == 0) {                                  \
3210                 continue;                                               \
3211             }                                                           \
3212             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
3213             if (!(mask & (1 << (e * ESIZE)))) {                         \
3214                 /* We need the result but without updating flags */     \
3215                 scratch_fpst = *fpst;                                   \
3216                 fpst = &scratch_fpst;                                   \
3217             }                                                           \
3218             r = FN(n[H##ESIZE(e)], (TYPE)rm, fpst);                     \
3219             /* Comparison sets 0/1 bits for each byte in the element */ \
3220             beatpred |= r * emask;                                      \
3221         }                                                               \
3222         beatpred &= mask;                                               \
3223         env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) |           \
3224             (beatpred & eci_mask);                                      \
3225         mve_advance_vpt(env);                                           \
3226     }
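/*
 * The two macros above build the new VPR.P0 value in the same way: the
 * boolean comparison result is replicated into one predicate bit per
 * byte of the element (r * emask), restricted to the lanes that were
 * active (beatpred &= mask), and then written back only for the beats
 * this insn actually executed (eci_mask).  A minimal standalone sketch
 * of that bit manipulation for 16-bit elements, with hypothetical names:
 */
static inline uint16_t example_vcmp_predicate(uint16_t old_p0, uint16_t mask,
                                              uint16_t eci_mask,
                                              const bool r[8])
{
    uint16_t emask = 0x3;    /* two predicate bits (bytes) per element */
    uint16_t beatpred = 0;
    unsigned e;

    for (e = 0; e < 8; e++, emask <<= 2) {
        /*
         * r[e] * emask is either 0 or the whole per-element bit group.
         * (The real helpers also skip fully masked elements so that no
         * FP comparison is performed for them at all.)
         */
        beatpred |= r[e] * emask;
    }
    beatpred &= mask;                           /* only active lanes */
    /* keep the old P0 bits for beats this insn did not execute */
    return (old_p0 & ~eci_mask) | (beatpred & eci_mask);
}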
3227 
3228 #define DO_VCMP_FP_BOTH(VOP, SOP, ESIZE, TYPE, FN)      \
3229     DO_VCMP_FP(VOP, ESIZE, TYPE, FN)                    \
3230     DO_VCMP_FP_SCALAR(SOP, ESIZE, TYPE, FN)
3231 
3232 /*
3233  * Some care is needed here to get the correct result for the unordered case.
3234  * Architecturally EQ, GE and GT are defined to be false for unordered, but
3235  * the NE, LT and LE comparisons are defined as simple logical inverses of
3236  * EQ, GE and GT and so they must return true for unordered. The softfloat
3237  * comparison functions float*_{eq,le,lt} all return false for unordered.
3238  */
3239 #define DO_GE16(X, Y, S) float16_le(Y, X, S)
3240 #define DO_GE32(X, Y, S) float32_le(Y, X, S)
3241 #define DO_GT16(X, Y, S) float16_lt(Y, X, S)
3242 #define DO_GT32(X, Y, S) float32_lt(Y, X, S)
3243 
3244 DO_VCMP_FP_BOTH(vfcmpeqh, vfcmpeq_scalarh, 2, float16, float16_eq)
3245 DO_VCMP_FP_BOTH(vfcmpeqs, vfcmpeq_scalars, 4, float32, float32_eq)
3246 
3247 DO_VCMP_FP_BOTH(vfcmpneh, vfcmpne_scalarh, 2, float16, !float16_eq)
3248 DO_VCMP_FP_BOTH(vfcmpnes, vfcmpne_scalars, 4, float32, !float32_eq)
3249 
3250 DO_VCMP_FP_BOTH(vfcmpgeh, vfcmpge_scalarh, 2, float16, DO_GE16)
3251 DO_VCMP_FP_BOTH(vfcmpges, vfcmpge_scalars, 4, float32, DO_GE32)
3252 
3253 DO_VCMP_FP_BOTH(vfcmplth, vfcmplt_scalarh, 2, float16, !DO_GE16)
3254 DO_VCMP_FP_BOTH(vfcmplts, vfcmplt_scalars, 4, float32, !DO_GE32)
3255 
3256 DO_VCMP_FP_BOTH(vfcmpgth, vfcmpgt_scalarh, 2, float16, DO_GT16)
3257 DO_VCMP_FP_BOTH(vfcmpgts, vfcmpgt_scalars, 4, float32, DO_GT32)
3258 
3259 DO_VCMP_FP_BOTH(vfcmpleh, vfcmple_scalarh, 2, float16, !DO_GT16)
3260 DO_VCMP_FP_BOTH(vfcmples, vfcmple_scalars, 4, float32, !DO_GT32)
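/*
 * The unordered-case behaviour described in the comment above can be
 * illustrated with ordinary host float comparisons, which (like the
 * softfloat float*_{eq,le,lt} helpers) evaluate to false when either
 * operand is a NaN.  A hypothetical self-contained sketch, using host
 * arithmetic rather than softfloat purely for illustration:
 */
static inline bool example_unordered_semantics(void)
{
    volatile float zero = 0.0f;
    float x = 1.0f;
    float y = zero / zero;          /* y is a NaN */

    bool eq = (x == y);             /* false: EQ is false for unordered   */
    bool ne = !(x == y);            /* true:  NE is the logical inverse   */
    bool ge = (y <= x);             /* false: GE(x, y) computed as y <= x */
    bool lt = !(y <= x);            /* true:  LT(x, y) is the inverse     */

    return !eq && ne && !ge && lt;  /* all four match the comment above */
}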
3261 
3262 #define DO_VCVT_FIXED(OP, ESIZE, TYPE, FN)                              \
3263     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vm,   \
3264                                 uint32_t shift)                         \
3265     {                                                                   \
3266         TYPE *d = vd, *m = vm;                                          \
3267         TYPE r;                                                         \
3268         uint16_t mask = mve_element_mask(env);                          \
3269         unsigned e;                                                     \
3270         float_status *fpst;                                             \
3271         float_status scratch_fpst;                                      \
3272         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
3273             if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) {              \
3274                 continue;                                               \
3275             }                                                           \
3276             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
3277             if (!(mask & 1)) {                                          \
3278                 /* We need the result but without updating flags */     \
3279                 scratch_fpst = *fpst;                                   \
3280                 fpst = &scratch_fpst;                                   \
3281             }                                                           \
3282             r = FN(m[H##ESIZE(e)], shift, fpst);                        \
3283             mergemask(&d[H##ESIZE(e)], r, mask);                        \
3284         }                                                               \
3285         mve_advance_vpt(env);                                           \
3286     }
3287 
3288 DO_VCVT_FIXED(vcvt_sh, 2, int16_t, helper_vfp_shtoh)
3289 DO_VCVT_FIXED(vcvt_uh, 2, uint16_t, helper_vfp_uhtoh)
3290 DO_VCVT_FIXED(vcvt_hs, 2, int16_t, helper_vfp_toshh_round_to_zero)
3291 DO_VCVT_FIXED(vcvt_hu, 2, uint16_t, helper_vfp_touhh_round_to_zero)
3292 DO_VCVT_FIXED(vcvt_sf, 4, int32_t, helper_vfp_sltos)
3293 DO_VCVT_FIXED(vcvt_uf, 4, uint32_t, helper_vfp_ultos)
3294 DO_VCVT_FIXED(vcvt_fs, 4, int32_t, helper_vfp_tosls_round_to_zero)
3295 DO_VCVT_FIXED(vcvt_fu, 4, uint32_t, helper_vfp_touls_round_to_zero)
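/*
 * In the DO_VCVT_FIXED expansions the 'shift' argument is the number of
 * fractional bits in the fixed-point operand, so (as pure arithmetic,
 * ignoring the rounding and saturation that the softfloat helpers take
 * care of) the conversions amount to scaling by 2^-shift or 2^shift.
 * A rough sketch with hypothetical names, assuming shift < 31:
 */
static inline float example_fixed_to_float(int32_t fixed, unsigned shift)
{
    /* e.g. fixed = 0x180, shift = 8  ->  1.5f */
    return (float)fixed / (float)(1u << shift);
}

static inline int32_t example_float_to_fixed(float f, unsigned shift)
{
    /* truncation, like the *_round_to_zero helpers above (no saturation) */
    return (int32_t)(f * (float)(1u << shift));
}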
3296 
3297 /* VCVT with specified rmode */
3298 #define DO_VCVT_RMODE(OP, ESIZE, TYPE, FN)                              \
3299     void HELPER(glue(mve_, OP))(CPUARMState *env,                       \
3300                                 void *vd, void *vm, uint32_t rmode)     \
3301     {                                                                   \
3302         TYPE *d = vd, *m = vm;                                          \
3303         TYPE r;                                                         \
3304         uint16_t mask = mve_element_mask(env);                          \
3305         unsigned e;                                                     \
3306         float_status *fpst;                                             \
3307         float_status scratch_fpst;                                      \
3308         float_status *base_fpst =                                       \
3309             &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD];  \
3310         uint32_t prev_rmode = get_float_rounding_mode(base_fpst);       \
3311         set_float_rounding_mode(rmode, base_fpst);                      \
3312         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
3313             if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) {              \
3314                 continue;                                               \
3315             }                                                           \
3316             fpst = base_fpst;                                           \
3317             if (!(mask & 1)) {                                          \
3318                 /* We need the result but without updating flags */     \
3319                 scratch_fpst = *fpst;                                   \
3320                 fpst = &scratch_fpst;                                   \
3321             }                                                           \
3322             r = FN(m[H##ESIZE(e)], 0, fpst);                            \
3323             mergemask(&d[H##ESIZE(e)], r, mask);                        \
3324         }                                                               \
3325         set_float_rounding_mode(prev_rmode, base_fpst);                 \
3326         mve_advance_vpt(env);                                           \
3327     }
3328 
3329 DO_VCVT_RMODE(vcvt_rm_sh, 2, uint16_t, helper_vfp_toshh)
3330 DO_VCVT_RMODE(vcvt_rm_uh, 2, uint16_t, helper_vfp_touhh)
3331 DO_VCVT_RMODE(vcvt_rm_ss, 4, uint32_t, helper_vfp_tosls)
3332 DO_VCVT_RMODE(vcvt_rm_us, 4, uint32_t, helper_vfp_touls)
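/*
 * DO_VCVT_RMODE temporarily installs the requested rounding mode on the
 * shared Standard-FPSCR float_status, runs the whole beat-wise loop, and
 * then restores the caller's mode so later instructions are unaffected.
 * The same save/set/restore pattern in isolation (hypothetical helper
 * name; the accessors are the real ones from fpu/softfloat.h):
 */
static inline float16 example_round_with_mode(float16 f, FloatRoundMode rmode,
                                              float_status *fpst)
{
    FloatRoundMode saved = get_float_rounding_mode(fpst);
    float16 r;

    set_float_rounding_mode(rmode, fpst);
    r = float16_round_to_int(f, fpst);      /* uses the temporary mode */
    set_float_rounding_mode(saved, fpst);   /* restore for later callers */
    return r;
}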
3333 
3334 #define DO_VRINT_RM_H(M, F, S) helper_rinth(M, S)
3335 #define DO_VRINT_RM_S(M, F, S) helper_rints(M, S)
3336 
3337 DO_VCVT_RMODE(vrint_rm_h, 2, uint16_t, DO_VRINT_RM_H)
3338 DO_VCVT_RMODE(vrint_rm_s, 4, uint32_t, DO_VRINT_RM_S)
3339 
3340 /*
3341  * VCVT between halfprec and singleprec. As usual for halfprec
3342  * conversions, FZ16 is ignored and AHP is observed.
3343  */
3344 static void do_vcvt_sh(CPUARMState *env, void *vd, void *vm, int top)
3345 {
3346     uint16_t *d = vd;
3347     uint32_t *m = vm;
3348     uint16_t r;
3349     uint16_t mask = mve_element_mask(env);
3350     bool ieee = !(env->vfp.fpcr & FPCR_AHP);
3351     unsigned e;
3352     float_status *fpst;
3353     float_status scratch_fpst;
3354     float_status *base_fpst = &env->vfp.fp_status[FPST_STD];
3355     bool old_fz = get_flush_to_zero(base_fpst);
3356     set_flush_to_zero(false, base_fpst);
3357     for (e = 0; e < 16 / 4; e++, mask >>= 4) {
3358         if ((mask & MAKE_64BIT_MASK(0, 4)) == 0) {
3359             continue;
3360         }
3361         fpst = base_fpst;
3362         if (!(mask & 1)) {
3363             /* We need the result but without updating flags */
3364             scratch_fpst = *fpst;
3365             fpst = &scratch_fpst;
3366         }
3367         r = float32_to_float16(m[H4(e)], ieee, fpst);
3368         mergemask(&d[H2(e * 2 + top)], r, mask >> (top * 2));
3369     }
3370     set_flush_to_zero(old_fz, base_fpst);
3371     mve_advance_vpt(env);
3372 }
3373 
3374 static void do_vcvt_hs(CPUARMState *env, void *vd, void *vm, int top)
3375 {
3376     uint32_t *d = vd;
3377     uint16_t *m = vm;
3378     uint32_t r;
3379     uint16_t mask = mve_element_mask(env);
3380     bool ieee = !(env->vfp.fpcr & FPCR_AHP);
3381     unsigned e;
3382     float_status *fpst;
3383     float_status scratch_fpst;
3384     float_status *base_fpst = &env->vfp.fp_status[FPST_STD];
3385     bool old_fiz = get_flush_inputs_to_zero(base_fpst);
3386     set_flush_inputs_to_zero(false, base_fpst);
3387     for (e = 0; e < 16 / 4; e++, mask >>= 4) {
3388         if ((mask & MAKE_64BIT_MASK(0, 4)) == 0) {
3389             continue;
3390         }
3391         fpst = base_fpst;
3392         if (!(mask & (1 << (top * 2)))) {
3393             /* We need the result but without updating flags */
3394             scratch_fpst = *fpst;
3395             fpst = &scratch_fpst;
3396         }
3397         r = float16_to_float32(m[H2(e * 2 + top)], ieee, fpst);
3398         mergemask(&d[H4(e)], r, mask);
3399     }
3400     set_flush_inputs_to_zero(old_fiz, base_fpst);
3401     mve_advance_vpt(env);
3402 }
3403 
3404 void HELPER(mve_vcvtb_sh)(CPUARMState *env, void *vd, void *vm)
3405 {
3406     do_vcvt_sh(env, vd, vm, 0);
3407 }
3408 void HELPER(mve_vcvtt_sh)(CPUARMState *env, void *vd, void *vm)
3409 {
3410     do_vcvt_sh(env, vd, vm, 1);
3411 }
3412 void HELPER(mve_vcvtb_hs)(CPUARMState *env, void *vd, void *vm)
3413 {
3414     do_vcvt_hs(env, vd, vm, 0);
3415 }
3416 void HELPER(mve_vcvtt_hs)(CPUARMState *env, void *vd, void *vm)
3417 {
3418     do_vcvt_hs(env, vd, vm, 1);
3419 }
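/*
 * The vcvtb/vcvtt pairs above differ only in the 'top' argument: the
 * half-precision value for 32-bit lane e lives at 16-bit index
 * (e * 2 + top), and its predicate bits are bits (top * 2) and
 * (top * 2 + 1) of that lane's 4-bit group, which is why do_vcvt_sh
 * shifts the mask right by (top * 2) and do_vcvt_hs tests bit (top * 2).
 * A hypothetical sketch of just the index arithmetic for the narrowing
 * direction, ignoring predication and the H2/H4 byte-order swizzles:
 */
static inline void example_pack_halves(uint16_t *dst16,
                                       const uint16_t *converted, int top)
{
    unsigned e;

    for (e = 0; e < 4; e++) {
        /* write only the selected half of each 32-bit destination lane */
        dst16[e * 2 + top] = converted[e];
    }
}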
3420 
3421 #define DO_1OP_FP(OP, ESIZE, TYPE, FN)                                  \
3422     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vm)   \
3423     {                                                                   \
3424         TYPE *d = vd, *m = vm;                                          \
3425         TYPE r;                                                         \
3426         uint16_t mask = mve_element_mask(env);                          \
3427         unsigned e;                                                     \
3428         float_status *fpst;                                             \
3429         float_status scratch_fpst;                                      \
3430         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
3431             if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) {              \
3432                 continue;                                               \
3433             }                                                           \
3434             fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
3435             if (!(mask & 1)) {                                          \
3436                 /* We need the result but without updating flags */     \
3437                 scratch_fpst = *fpst;                                   \
3438                 fpst = &scratch_fpst;                                   \
3439             }                                                           \
3440             r = FN(m[H##ESIZE(e)], fpst);                               \
3441             mergemask(&d[H##ESIZE(e)], r, mask);                        \
3442         }                                                               \
3443         mve_advance_vpt(env);                                           \
3444     }
3445 
3446 DO_1OP_FP(vrintx_h, 2, float16, float16_round_to_int)
3447 DO_1OP_FP(vrintx_s, 4, float32, float32_round_to_int)
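/*
 * All of the vector helpers above store their results with mergemask(),
 * defined earlier in this file, which updates the destination byte by
 * byte under the predicate so that masked-out bytes keep their previous
 * contents.  A hypothetical standalone equivalent for a single 16-bit
 * element (mask bit 0 covers the low byte, bit 1 the high byte):
 */
static inline void example_merge_u16(uint16_t *d, uint16_t r, uint16_t mask)
{
    uint16_t bytemask = 0;

    if (mask & 1) {
        bytemask |= 0x00ff;     /* low byte predicated in */
    }
    if (mask & 2) {
        bytemask |= 0xff00;     /* high byte predicated in */
    }
    *d = (*d & ~bytemask) | (r & bytemask);
}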
3448