xref: /qemu/target/arm/tcg/mve_helper.c (revision e3152d02da21ac6e2169b1bf104a2d0478664a4a)
1 /*
2  * M-profile MVE Operations
3  *
4  * Copyright (c) 2021 Linaro, Ltd.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "vec_internal.h"
24 #include "exec/helper-proto.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/exec-all.h"
27 #include "tcg/tcg.h"
28 
29 static uint16_t mve_eci_mask(CPUARMState *env)
30 {
31     /*
32      * Return the mask of which elements in the MVE vector correspond
33      * to beats being executed. The mask has 1 bits for executed lanes
34      * and 0 bits where ECI says this beat was already executed.
35      */
36     int eci;
37 
38     if ((env->condexec_bits & 0xf) != 0) {
39         return 0xffff;
40     }
41 
42     eci = env->condexec_bits >> 4;
43     switch (eci) {
44     case ECI_NONE:
45         return 0xffff;
46     case ECI_A0:
47         return 0xfff0;
48     case ECI_A0A1:
49         return 0xff00;
50     case ECI_A0A1A2:
51     case ECI_A0A1A2B0:
52         return 0xf000;
53     default:
54         g_assert_not_reached();
55     }
56 }
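
/*
 * Worked example: each beat covers 4 bytes of the 16-byte vector, i.e.
 * 4 bits of this mask. So for ECI_A0A1 (beats 0 and 1 already executed)
 * we return 0xff00: bits 0..7 are clear to suppress beats 0 and 1, and
 * bits 8..15 are set so beats 2 and 3 still execute.
 */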
57 
58 static uint16_t mve_element_mask(CPUARMState *env)
59 {
60     /*
61      * Return the mask of which elements in the MVE vector should be
62      * updated. This is a combination of multiple things:
63      *  (1) by default, we update every lane in the vector
64      *  (2) VPT predication stores its state in the VPR register;
65      *  (3) low-overhead-branch tail predication will mask out part of
66      *      the vector on the final iteration of the loop
67      *  (4) if EPSR.ECI is set then we must execute only some beats
68      *      of the insn
69      * We combine all these into a 16-bit result with the same semantics
70      * as VPR.P0: 0 to mask the lane, 1 if it is active.
71      * 8-bit vector ops will look at all bits of the result;
72      * 16-bit ops will look at bits 0, 2, 4, ...;
73      * 32-bit ops will look at bits 0, 4, 8 and 12.
74      * Compare pseudocode GetCurInstrBeat(), though that only returns
75      * the 4-bit slice of the mask corresponding to a single beat.
76      */
77     uint16_t mask = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0);
78 
79     if (!(env->v7m.vpr & R_V7M_VPR_MASK01_MASK)) {
80         mask |= 0xff;
81     }
82     if (!(env->v7m.vpr & R_V7M_VPR_MASK23_MASK)) {
83         mask |= 0xff00;
84     }
85 
86     if (env->v7m.ltpsize < 4 &&
87         env->regs[14] <= (1 << (4 - env->v7m.ltpsize))) {
88         /*
89          * Tail predication active, and this is the last loop iteration.
90          * The element size is (1 << ltpsize), and we only want to process
91          * loopcount elements, so we want to retain the least significant
92          * (loopcount * esize) predicate bits and zero out bits above that.
93          */
94         int masklen = env->regs[14] << env->v7m.ltpsize;
95         assert(masklen <= 16);
96         uint16_t ltpmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0;
97         mask &= ltpmask;
98     }
99 
100     /*
101      * ECI bits indicate which beats are already executed;
102      * we handle this by effectively predicating them out.
103      */
104     mask &= mve_eci_mask(env);
105     return mask;
106 }
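
/*
 * Tail-predication example: with LTPSIZE == 2 (32-bit elements) and
 * LR == 2 on the final loop iteration, masklen is 2 << 2 == 8, so
 * ltpmask is 0x00ff and only predicate bits 0 and 4 (the first two
 * 32-bit lanes) can remain set in the returned mask.
 */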
107 
108 static void mve_advance_vpt(CPUARMState *env)
109 {
110     /* Advance the VPT and ECI state if necessary */
111     uint32_t vpr = env->v7m.vpr;
112     unsigned mask01, mask23;
113     uint16_t inv_mask;
114     uint16_t eci_mask = mve_eci_mask(env);
115 
116     if ((env->condexec_bits & 0xf) == 0) {
117         env->condexec_bits = (env->condexec_bits == (ECI_A0A1A2B0 << 4)) ?
118             (ECI_A0 << 4) : (ECI_NONE << 4);
119     }
120 
121     if (!(vpr & (R_V7M_VPR_MASK01_MASK | R_V7M_VPR_MASK23_MASK))) {
122         /* VPT not enabled, nothing to do */
123         return;
124     }
125 
126     /* Invert P0 bits if needed, but only for beats we actually executed */
127     mask01 = FIELD_EX32(vpr, V7M_VPR, MASK01);
128     mask23 = FIELD_EX32(vpr, V7M_VPR, MASK23);
129     /* Start by assuming we invert all bits corresponding to executed beats */
130     inv_mask = eci_mask;
131     if (mask01 <= 8) {
132         /* MASK01 says don't invert low half of P0 */
133         inv_mask &= ~0xff;
134     }
135     if (mask23 <= 8) {
136         /* MASK23 says don't invert high half of P0 */
137         inv_mask &= ~0xff00;
138     }
139     vpr ^= inv_mask;
140     /* Only update MASK01 if beat 1 executed */
141     if (eci_mask & 0xf0) {
142         vpr = FIELD_DP32(vpr, V7M_VPR, MASK01, mask01 << 1);
143     }
144     /* Beat 3 always executes, so update MASK23 */
145     vpr = FIELD_DP32(vpr, V7M_VPR, MASK23, mask23 << 1);
146     env->v7m.vpr = vpr;
147 }
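
/*
 * Example of the advance, following the code above: if all four beats
 * execute and MASK01 is 0b1100, then 12 > 8 so the low byte of P0 is
 * inverted and MASK01 shifts to 0b1000; on the next advance 8 <= 8 so
 * P0 is left alone and MASK01 shifts out to 0, ending the VPT block
 * for that half.
 */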
148 
149 
150 #define DO_VLDR(OP, MSIZE, LDTYPE, ESIZE, TYPE)                         \
151     void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr)    \
152     {                                                                   \
153         TYPE *d = vd;                                                   \
154         uint16_t mask = mve_element_mask(env);                          \
155         unsigned b, e;                                                  \
156         /*                                                              \
157          * R_SXTM allows the dest reg to become UNKNOWN for abandoned   \
158          * beats so we don't care if we update part of the dest and     \
159          * then take an exception.                                      \
160          */                                                             \
161         for (b = 0, e = 0; b < 16; b += ESIZE, e++) {                   \
162             if (mask & (1 << b)) {                                      \
163                 d[H##ESIZE(e)] = cpu_##LDTYPE##_data_ra(env, addr, GETPC()); \
164             }                                                           \
165             addr += MSIZE;                                              \
166         }                                                               \
167         mve_advance_vpt(env);                                           \
168     }
169 
170 #define DO_VSTR(OP, MSIZE, STTYPE, ESIZE, TYPE)                         \
171     void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr)    \
172     {                                                                   \
173         TYPE *d = vd;                                                   \
174         uint16_t mask = mve_element_mask(env);                          \
175         unsigned b, e;                                                  \
176         for (b = 0, e = 0; b < 16; b += ESIZE, e++) {                   \
177             if (mask & (1 << b)) {                                      \
178                 cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \
179             }                                                           \
180             addr += MSIZE;                                              \
181         }                                                               \
182         mve_advance_vpt(env);                                           \
183     }
184 
185 DO_VLDR(vldrb, 1, ldub, 1, uint8_t)
186 DO_VLDR(vldrh, 2, lduw, 2, uint16_t)
187 DO_VLDR(vldrw, 4, ldl, 4, uint32_t)
188 
189 DO_VSTR(vstrb, 1, stb, 1, uint8_t)
190 DO_VSTR(vstrh, 2, stw, 2, uint16_t)
191 DO_VSTR(vstrw, 4, stl, 4, uint32_t)
192 
193 DO_VLDR(vldrb_sh, 1, ldsb, 2, int16_t)
194 DO_VLDR(vldrb_sw, 1, ldsb, 4, int32_t)
195 DO_VLDR(vldrb_uh, 1, ldub, 2, uint16_t)
196 DO_VLDR(vldrb_uw, 1, ldub, 4, uint32_t)
197 DO_VLDR(vldrh_sw, 2, ldsw, 4, int32_t)
198 DO_VLDR(vldrh_uw, 2, lduw, 4, uint32_t)
199 
200 DO_VSTR(vstrb_h, 1, stb, 2, int16_t)
201 DO_VSTR(vstrb_w, 1, stb, 4, int32_t)
202 DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
203 
204 #undef DO_VLDR
205 #undef DO_VSTR
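
/*
 * As an illustration (not part of the build), the widening load
 * DO_VLDR(vldrh_sw, 2, ldsw, 4, int32_t) above expands to roughly:
 *
 *   void HELPER(mve_vldrh_sw)(CPUARMState *env, void *vd, uint32_t addr)
 *   {
 *       int32_t *d = vd;
 *       uint16_t mask = mve_element_mask(env);
 *       unsigned b, e;
 *       for (b = 0, e = 0; b < 16; b += 4, e++) {
 *           if (mask & (1 << b)) {
 *               d[H4(e)] = cpu_ldsw_data_ra(env, addr, GETPC());
 *           }
 *           addr += 2;
 *       }
 *       mve_advance_vpt(env);
 *   }
 *
 * i.e. four 32-bit lanes, each active lane loading a sign-extended
 * halfword, with addr stepping by the memory element size (2) whether
 * or not the lane is predicated.
 */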
206 
207 /*
208  * The mergemask(D, R, M) macro performs the operation "*D = R" but
209  * storing only the bytes which correspond to 1 bits in M,
210  * leaving other bytes in *D unchanged. We use _Generic
211  * to select the correct implementation based on the type of D.
212  */
213 
214 static void mergemask_ub(uint8_t *d, uint8_t r, uint16_t mask)
215 {
216     if (mask & 1) {
217         *d = r;
218     }
219 }
220 
221 static void mergemask_sb(int8_t *d, int8_t r, uint16_t mask)
222 {
223     mergemask_ub((uint8_t *)d, r, mask);
224 }
225 
226 static void mergemask_uh(uint16_t *d, uint16_t r, uint16_t mask)
227 {
228     uint16_t bmask = expand_pred_b_data[mask & 3];
229     *d = (*d & ~bmask) | (r & bmask);
230 }
231 
232 static void mergemask_sh(int16_t *d, int16_t r, uint16_t mask)
233 {
234     mergemask_uh((uint16_t *)d, r, mask);
235 }
236 
237 static void mergemask_uw(uint32_t *d, uint32_t r, uint16_t mask)
238 {
239     uint32_t bmask = expand_pred_b_data[mask & 0xf];
240     *d = (*d & ~bmask) | (r & bmask);
241 }
242 
243 static void mergemask_sw(int32_t *d, int32_t r, uint16_t mask)
244 {
245     mergemask_uw((uint32_t *)d, r, mask);
246 }
247 
248 static void mergemask_uq(uint64_t *d, uint64_t r, uint16_t mask)
249 {
250     uint64_t bmask = expand_pred_b_data[mask & 0xff];
251     *d = (*d & ~bmask) | (r & bmask);
252 }
253 
254 static void mergemask_sq(int64_t *d, int64_t r, uint16_t mask)
255 {
256     mergemask_uq((uint64_t *)d, r, mask);
257 }
258 
259 #define mergemask(D, R, M)                      \
260     _Generic(D,                                 \
261              uint8_t *: mergemask_ub,           \
262              int8_t *:  mergemask_sb,           \
263              uint16_t *: mergemask_uh,          \
264              int16_t *:  mergemask_sh,          \
265              uint32_t *: mergemask_uw,          \
266              int32_t *:  mergemask_sw,          \
267              uint64_t *: mergemask_uq,          \
268              int64_t *:  mergemask_sq)(D, R, M)
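
/*
 * For example, merging a uint32_t element with the low mask nibble
 * 0b0011 gives bmask == 0x0000ffff, so only the two least significant
 * bytes of the 32-bit element are overwritten and the upper two bytes
 * keep their previous contents.
 */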
269 
270 void HELPER(mve_vdup)(CPUARMState *env, void *vd, uint32_t val)
271 {
272     /*
273      * The generated code already replicated an 8 or 16 bit constant
274      * into the 32-bit value, so we only need to write the 32-bit
275      * value to all elements of the Qreg, allowing for predication.
276      */
277     uint32_t *d = vd;
278     uint16_t mask = mve_element_mask(env);
279     unsigned e;
280     for (e = 0; e < 16 / 4; e++, mask >>= 4) {
281         mergemask(&d[H4(e)], val, mask);
282     }
283     mve_advance_vpt(env);
284 }
285 
286 #define DO_1OP(OP, ESIZE, TYPE, FN)                                     \
287     void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm)         \
288     {                                                                   \
289         TYPE *d = vd, *m = vm;                                          \
290         uint16_t mask = mve_element_mask(env);                          \
291         unsigned e;                                                     \
292         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
293             mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)]), mask);       \
294         }                                                               \
295         mve_advance_vpt(env);                                           \
296     }
297 
298 #define DO_CLS_B(N)   (clrsb32(N) - 24)
299 #define DO_CLS_H(N)   (clrsb32(N) - 16)
300 
301 DO_1OP(vclsb, 1, int8_t, DO_CLS_B)
302 DO_1OP(vclsh, 2, int16_t, DO_CLS_H)
303 DO_1OP(vclsw, 4, int32_t, clrsb32)
304 
305 #define DO_CLZ_B(N)   (clz32(N) - 24)
306 #define DO_CLZ_H(N)   (clz32(N) - 16)
307 
308 DO_1OP(vclzb, 1, uint8_t, DO_CLZ_B)
309 DO_1OP(vclzh, 2, uint16_t, DO_CLZ_H)
310 DO_1OP(vclzw, 4, uint32_t, clz32)
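
/*
 * Note on the byte and halfword forms: the input element is widened to
 * 32 bits before the count, so the macros subtract the extra width.
 * For example DO_CLZ_B(0x10) is clz32(0x00000010) - 24 == 27 - 24 == 3,
 * the leading-zero count of the 8-bit value; likewise DO_CLS_B(0x03)
 * is clrsb32(3) - 24 == 29 - 24 == 5.
 */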
311 
312 DO_1OP(vrev16b, 2, uint16_t, bswap16)
313 DO_1OP(vrev32b, 4, uint32_t, bswap32)
314 DO_1OP(vrev32h, 4, uint32_t, hswap32)
315 DO_1OP(vrev64b, 8, uint64_t, bswap64)
316 DO_1OP(vrev64h, 8, uint64_t, hswap64)
317 DO_1OP(vrev64w, 8, uint64_t, wswap64)
318 
319 #define DO_NOT(N) (~(N))
320 
321 DO_1OP(vmvn, 8, uint64_t, DO_NOT)
322 
323 #define DO_ABS(N) ((N) < 0 ? -(N) : (N))
324 #define DO_FABSH(N)  ((N) & dup_const(MO_16, 0x7fff))
325 #define DO_FABSS(N)  ((N) & dup_const(MO_32, 0x7fffffff))
326 
327 DO_1OP(vabsb, 1, int8_t, DO_ABS)
328 DO_1OP(vabsh, 2, int16_t, DO_ABS)
329 DO_1OP(vabsw, 4, int32_t, DO_ABS)
330 
331 /* We can do these 64 bits at a time */
332 DO_1OP(vfabsh, 8, uint64_t, DO_FABSH)
333 DO_1OP(vfabss, 8, uint64_t, DO_FABSS)
334 
335 #define DO_NEG(N)    (-(N))
336 #define DO_FNEGH(N) ((N) ^ dup_const(MO_16, 0x8000))
337 #define DO_FNEGS(N) ((N) ^ dup_const(MO_32, 0x80000000))
338 
339 DO_1OP(vnegb, 1, int8_t, DO_NEG)
340 DO_1OP(vnegh, 2, int16_t, DO_NEG)
341 DO_1OP(vnegw, 4, int32_t, DO_NEG)
342 
343 /* We can do these 64 bits at a time */
344 DO_1OP(vfnegh, 8, uint64_t, DO_FNEGH)
345 DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS)
346 
347 /*
348  * 1 operand immediates: Vda is destination and possibly also one source.
349  * All these insns work at 64-bit widths.
350  */
351 #define DO_1OP_IMM(OP, FN)                                              \
352     void HELPER(mve_##OP)(CPUARMState *env, void *vda, uint64_t imm)    \
353     {                                                                   \
354         uint64_t *da = vda;                                             \
355         uint16_t mask = mve_element_mask(env);                          \
356         unsigned e;                                                     \
357         for (e = 0; e < 16 / 8; e++, mask >>= 8) {                      \
358             mergemask(&da[H8(e)], FN(da[H8(e)], imm), mask);            \
359         }                                                               \
360         mve_advance_vpt(env);                                           \
361     }
362 
363 #define DO_MOVI(N, I) (I)
364 #define DO_ANDI(N, I) ((N) & (I))
365 #define DO_ORRI(N, I) ((N) | (I))
366 
367 DO_1OP_IMM(vmovi, DO_MOVI)
368 DO_1OP_IMM(vandi, DO_ANDI)
369 DO_1OP_IMM(vorri, DO_ORRI)
370 
371 #define DO_2OP(OP, ESIZE, TYPE, FN)                                     \
372     void HELPER(glue(mve_, OP))(CPUARMState *env,                       \
373                                 void *vd, void *vn, void *vm)           \
374     {                                                                   \
375         TYPE *d = vd, *n = vn, *m = vm;                                 \
376         uint16_t mask = mve_element_mask(env);                          \
377         unsigned e;                                                     \
378         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
379             mergemask(&d[H##ESIZE(e)],                                  \
380                       FN(n[H##ESIZE(e)], m[H##ESIZE(e)]), mask);        \
381         }                                                               \
382         mve_advance_vpt(env);                                           \
383     }
384 
385 /* provide unsigned 2-op helpers for all sizes */
386 #define DO_2OP_U(OP, FN)                        \
387     DO_2OP(OP##b, 1, uint8_t, FN)               \
388     DO_2OP(OP##h, 2, uint16_t, FN)              \
389     DO_2OP(OP##w, 4, uint32_t, FN)
390 
391 /* provide signed 2-op helpers for all sizes */
392 #define DO_2OP_S(OP, FN)                        \
393     DO_2OP(OP##b, 1, int8_t, FN)                \
394     DO_2OP(OP##h, 2, int16_t, FN)               \
395     DO_2OP(OP##w, 4, int32_t, FN)
396 
397 /*
398  * "Long" operations where two half-sized inputs (taken from either the
399  * top or the bottom of the input vector) produce a double-width result.
400  * Here ESIZE, TYPE are for the input, and LESIZE, LTYPE for the output.
401  */
402 #define DO_2OP_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN)               \
403     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
404     {                                                                   \
405         LTYPE *d = vd;                                                  \
406         TYPE *n = vn, *m = vm;                                          \
407         uint16_t mask = mve_element_mask(env);                          \
408         unsigned le;                                                    \
409         for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) {         \
410             LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)],              \
411                          m[H##ESIZE(le * 2 + TOP)]);                    \
412             mergemask(&d[H##LESIZE(le)], r, mask);                      \
413         }                                                               \
414         mve_advance_vpt(env);                                           \
415     }
416 
417 #define DO_2OP_SAT(OP, ESIZE, TYPE, FN)                                 \
418     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
419     {                                                                   \
420         TYPE *d = vd, *n = vn, *m = vm;                                 \
421         uint16_t mask = mve_element_mask(env);                          \
422         unsigned e;                                                     \
423         bool qc = false;                                                \
424         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
425             bool sat = false;                                           \
426             TYPE r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], &sat);          \
427             mergemask(&d[H##ESIZE(e)], r, mask);                        \
428             qc |= sat & mask & 1;                                       \
429         }                                                               \
430         if (qc) {                                                       \
431             env->vfp.qc[0] = qc;                                        \
432         }                                                               \
433         mve_advance_vpt(env);                                           \
434     }
435 
436 /* provide unsigned 2-op helpers for all sizes */
437 #define DO_2OP_SAT_U(OP, FN)                    \
438     DO_2OP_SAT(OP##b, 1, uint8_t, FN)           \
439     DO_2OP_SAT(OP##h, 2, uint16_t, FN)          \
440     DO_2OP_SAT(OP##w, 4, uint32_t, FN)
441 
442 /* provide signed 2-op helpers for all sizes */
443 #define DO_2OP_SAT_S(OP, FN)                    \
444     DO_2OP_SAT(OP##b, 1, int8_t, FN)            \
445     DO_2OP_SAT(OP##h, 2, int16_t, FN)           \
446     DO_2OP_SAT(OP##w, 4, int32_t, FN)
447 
448 #define DO_AND(N, M)  ((N) & (M))
449 #define DO_BIC(N, M)  ((N) & ~(M))
450 #define DO_ORR(N, M)  ((N) | (M))
451 #define DO_ORN(N, M)  ((N) | ~(M))
452 #define DO_EOR(N, M)  ((N) ^ (M))
453 
454 DO_2OP(vand, 8, uint64_t, DO_AND)
455 DO_2OP(vbic, 8, uint64_t, DO_BIC)
456 DO_2OP(vorr, 8, uint64_t, DO_ORR)
457 DO_2OP(vorn, 8, uint64_t, DO_ORN)
458 DO_2OP(veor, 8, uint64_t, DO_EOR)
459 
460 #define DO_ADD(N, M) ((N) + (M))
461 #define DO_SUB(N, M) ((N) - (M))
462 #define DO_MUL(N, M) ((N) * (M))
463 
464 DO_2OP_U(vadd, DO_ADD)
465 DO_2OP_U(vsub, DO_SUB)
466 DO_2OP_U(vmul, DO_MUL)
467 
468 DO_2OP_L(vmullbsb, 0, 1, int8_t, 2, int16_t, DO_MUL)
469 DO_2OP_L(vmullbsh, 0, 2, int16_t, 4, int32_t, DO_MUL)
470 DO_2OP_L(vmullbsw, 0, 4, int32_t, 8, int64_t, DO_MUL)
471 DO_2OP_L(vmullbub, 0, 1, uint8_t, 2, uint16_t, DO_MUL)
472 DO_2OP_L(vmullbuh, 0, 2, uint16_t, 4, uint32_t, DO_MUL)
473 DO_2OP_L(vmullbuw, 0, 4, uint32_t, 8, uint64_t, DO_MUL)
474 
475 DO_2OP_L(vmulltsb, 1, 1, int8_t, 2, int16_t, DO_MUL)
476 DO_2OP_L(vmulltsh, 1, 2, int16_t, 4, int32_t, DO_MUL)
477 DO_2OP_L(vmulltsw, 1, 4, int32_t, 8, int64_t, DO_MUL)
478 DO_2OP_L(vmulltub, 1, 1, uint8_t, 2, uint16_t, DO_MUL)
479 DO_2OP_L(vmulltuh, 1, 2, uint16_t, 4, uint32_t, DO_MUL)
480 DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL)
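
/*
 * For example, vmulltsh above (TOP == 1) reads the odd-numbered int16_t
 * elements of each input (index le * 2 + 1), multiplies them, and
 * writes four int32_t results, while vmullbsh (TOP == 0) uses the
 * even-numbered (bottom) halves.
 */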
481 
482 /*
483  * Because the computation type is at least twice as large as required,
484  * these work for both signed and unsigned source types.
485  */
486 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
487 {
488     return (n * m) >> 8;
489 }
490 
491 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
492 {
493     return (n * m) >> 16;
494 }
495 
496 static inline uint32_t do_mulh_w(int64_t n, int64_t m)
497 {
498     return (n * m) >> 32;
499 }
500 
501 static inline uint8_t do_rmulh_b(int32_t n, int32_t m)
502 {
503     return (n * m + (1U << 7)) >> 8;
504 }
505 
506 static inline uint16_t do_rmulh_h(int32_t n, int32_t m)
507 {
508     return (n * m + (1U << 15)) >> 16;
509 }
510 
511 static inline uint32_t do_rmulh_w(int64_t n, int64_t m)
512 {
513     return (n * m + (1U << 31)) >> 32;
514 }
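
/*
 * Rounding example: do_mulh_b(15, 17) is 255 >> 8 == 0, while the
 * rounding form do_rmulh_b(15, 17) is (255 + 128) >> 8 == 1.
 */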
515 
516 DO_2OP(vmulhsb, 1, int8_t, do_mulh_b)
517 DO_2OP(vmulhsh, 2, int16_t, do_mulh_h)
518 DO_2OP(vmulhsw, 4, int32_t, do_mulh_w)
519 DO_2OP(vmulhub, 1, uint8_t, do_mulh_b)
520 DO_2OP(vmulhuh, 2, uint16_t, do_mulh_h)
521 DO_2OP(vmulhuw, 4, uint32_t, do_mulh_w)
522 
523 DO_2OP(vrmulhsb, 1, int8_t, do_rmulh_b)
524 DO_2OP(vrmulhsh, 2, int16_t, do_rmulh_h)
525 DO_2OP(vrmulhsw, 4, int32_t, do_rmulh_w)
526 DO_2OP(vrmulhub, 1, uint8_t, do_rmulh_b)
527 DO_2OP(vrmulhuh, 2, uint16_t, do_rmulh_h)
528 DO_2OP(vrmulhuw, 4, uint32_t, do_rmulh_w)
529 
530 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
531 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
532 
533 DO_2OP_S(vmaxs, DO_MAX)
534 DO_2OP_U(vmaxu, DO_MAX)
535 DO_2OP_S(vmins, DO_MIN)
536 DO_2OP_U(vminu, DO_MIN)
537 
538 #define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
539 
540 DO_2OP_S(vabds, DO_ABD)
541 DO_2OP_U(vabdu, DO_ABD)
542 
543 static inline uint32_t do_vhadd_u(uint32_t n, uint32_t m)
544 {
545     return ((uint64_t)n + m) >> 1;
546 }
547 
548 static inline int32_t do_vhadd_s(int32_t n, int32_t m)
549 {
550     return ((int64_t)n + m) >> 1;
551 }
552 
553 static inline uint32_t do_vhsub_u(uint32_t n, uint32_t m)
554 {
555     return ((uint64_t)n - m) >> 1;
556 }
557 
558 static inline int32_t do_vhsub_s(int32_t n, int32_t m)
559 {
560     return ((int64_t)n - m) >> 1;
561 }
562 
563 DO_2OP_S(vhadds, do_vhadd_s)
564 DO_2OP_U(vhaddu, do_vhadd_u)
565 DO_2OP_S(vhsubs, do_vhsub_s)
566 DO_2OP_U(vhsubu, do_vhsub_u)
567 
568 #define DO_VSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL)
569 #define DO_VSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL)
570 #define DO_VRSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL)
571 #define DO_VRSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL)
572 
573 DO_2OP_S(vshls, DO_VSHLS)
574 DO_2OP_U(vshlu, DO_VSHLU)
575 DO_2OP_S(vrshls, DO_VRSHLS)
576 DO_2OP_U(vrshlu, DO_VRSHLU)
577 
578 #define DO_RHADD_S(N, M) (((int64_t)(N) + (M) + 1) >> 1)
579 #define DO_RHADD_U(N, M) (((uint64_t)(N) + (M) + 1) >> 1)
580 
581 DO_2OP_S(vrhadds, DO_RHADD_S)
582 DO_2OP_U(vrhaddu, DO_RHADD_U)
583 
584 static void do_vadc(CPUARMState *env, uint32_t *d, uint32_t *n, uint32_t *m,
585                     uint32_t inv, uint32_t carry_in, bool update_flags)
586 {
587     uint16_t mask = mve_element_mask(env);
588     unsigned e;
589 
590     /* If any addition is actually performed, we must update the flags. */
591     if (mask & 0x1111) {
592         update_flags = true;
593     }
594 
595     for (e = 0; e < 16 / 4; e++, mask >>= 4) {
596         uint64_t r = carry_in;
597         r += n[H4(e)];
598         r += m[H4(e)] ^ inv;
599         if (mask & 1) {
600             carry_in = r >> 32;
601         }
602         mergemask(&d[H4(e)], r, mask);
603     }
604 
605     if (update_flags) {
606         /* Store C, clear NZV. */
607         env->vfp.xregs[ARM_VFP_FPSCR] &= ~FPCR_NZCV_MASK;
608         env->vfp.xregs[ARM_VFP_FPSCR] |= carry_in * FPCR_C;
609     }
610     mve_advance_vpt(env);
611 }
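
/*
 * Note: with inv == -1 the second operand is presented as ~m, so each
 * lane computes n + ~m + carry, i.e. a subtract-with-borrow; this lets
 * VSBC/VSBCI (below) reuse the same adder, while inv == 0 gives the
 * plain add-with-carry for VADC/VADCI.
 */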
612 
613 void HELPER(mve_vadc)(CPUARMState *env, void *vd, void *vn, void *vm)
614 {
615     bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C;
616     do_vadc(env, vd, vn, vm, 0, carry_in, false);
617 }
618 
619 void HELPER(mve_vsbc)(CPUARMState *env, void *vd, void *vn, void *vm)
620 {
621     bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C;
622     do_vadc(env, vd, vn, vm, -1, carry_in, false);
623 }
624 
625 
626 void HELPER(mve_vadci)(CPUARMState *env, void *vd, void *vn, void *vm)
627 {
628     do_vadc(env, vd, vn, vm, 0, 0, true);
629 }
630 
631 void HELPER(mve_vsbci)(CPUARMState *env, void *vd, void *vn, void *vm)
632 {
633     do_vadc(env, vd, vn, vm, -1, 1, true);
634 }
635 
636 #define DO_VCADD(OP, ESIZE, TYPE, FN0, FN1)                             \
637     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
638     {                                                                   \
639         TYPE *d = vd, *n = vn, *m = vm;                                 \
640         uint16_t mask = mve_element_mask(env);                          \
641         unsigned e;                                                     \
642         TYPE r[16 / ESIZE];                                             \
643         /* Calculate all results first to avoid overwriting inputs */   \
644         for (e = 0; e < 16 / ESIZE; e++) {                              \
645             if (!(e & 1)) {                                             \
646                 r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)]);         \
647             } else {                                                    \
648                 r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)]);         \
649             }                                                           \
650         }                                                               \
651         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
652             mergemask(&d[H##ESIZE(e)], r[e], mask);                     \
653         }                                                               \
654         mve_advance_vpt(env);                                           \
655     }
656 
657 #define DO_VCADD_ALL(OP, FN0, FN1)              \
658     DO_VCADD(OP##b, 1, int8_t, FN0, FN1)        \
659     DO_VCADD(OP##h, 2, int16_t, FN0, FN1)       \
660     DO_VCADD(OP##w, 4, int32_t, FN0, FN1)
661 
662 DO_VCADD_ALL(vcadd90, DO_SUB, DO_ADD)
663 DO_VCADD_ALL(vcadd270, DO_ADD, DO_SUB)
664 DO_VCADD_ALL(vhcadd90, do_vhsub_s, do_vhadd_s)
665 DO_VCADD_ALL(vhcadd270, do_vhadd_s, do_vhsub_s)
666 
667 static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s)
668 {
669     if (val > max) {
670         *s = true;
671         return max;
672     } else if (val < min) {
673         *s = true;
674         return min;
675     }
676     return val;
677 }
678 
679 #define DO_SQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, INT8_MIN, INT8_MAX, s)
680 #define DO_SQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, INT16_MIN, INT16_MAX, s)
681 #define DO_SQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, INT32_MIN, INT32_MAX, s)
682 
683 #define DO_UQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT8_MAX, s)
684 #define DO_UQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT16_MAX, s)
685 #define DO_UQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT32_MAX, s)
686 
687 #define DO_SQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, INT8_MIN, INT8_MAX, s)
688 #define DO_SQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, INT16_MIN, INT16_MAX, s)
689 #define DO_SQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, INT32_MIN, INT32_MAX, s)
690 
691 #define DO_UQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT8_MAX, s)
692 #define DO_UQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT16_MAX, s)
693 #define DO_UQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT32_MAX, s)
694 
695 /*
696  * For QDMULH and QRDMULH we simplify "double and shift by esize" into
697  * "shift by esize-1", adjusting the QRDMULH rounding constant to match.
698  */
699 #define DO_QDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m) >> 7, \
700                                         INT8_MIN, INT8_MAX, s)
701 #define DO_QDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m) >> 15, \
702                                         INT16_MIN, INT16_MAX, s)
703 #define DO_QDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m) >> 31, \
704                                         INT32_MIN, INT32_MAX, s)
705 
706 #define DO_QRDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 6)) >> 7, \
707                                          INT8_MIN, INT8_MAX, s)
708 #define DO_QRDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 14)) >> 15, \
709                                          INT16_MIN, INT16_MAX, s)
710 #define DO_QRDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 30)) >> 31, \
711                                          INT32_MIN, INT32_MAX, s)
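
/*
 * Saturation example: DO_QDMULH_H(INT16_MIN, INT16_MIN, &s) computes
 * (0x40000000 >> 15) == 32768, which exceeds INT16_MAX, so the result
 * saturates to 32767 and the flag is set; DO_QRDMULH_H additionally
 * adds 1 << 14 before the shift to round to nearest.
 */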
712 
713 DO_2OP_SAT(vqdmulhb, 1, int8_t, DO_QDMULH_B)
714 DO_2OP_SAT(vqdmulhh, 2, int16_t, DO_QDMULH_H)
715 DO_2OP_SAT(vqdmulhw, 4, int32_t, DO_QDMULH_W)
716 
717 DO_2OP_SAT(vqrdmulhb, 1, int8_t, DO_QRDMULH_B)
718 DO_2OP_SAT(vqrdmulhh, 2, int16_t, DO_QRDMULH_H)
719 DO_2OP_SAT(vqrdmulhw, 4, int32_t, DO_QRDMULH_W)
720 
721 DO_2OP_SAT(vqaddub, 1, uint8_t, DO_UQADD_B)
722 DO_2OP_SAT(vqadduh, 2, uint16_t, DO_UQADD_H)
723 DO_2OP_SAT(vqadduw, 4, uint32_t, DO_UQADD_W)
724 DO_2OP_SAT(vqaddsb, 1, int8_t, DO_SQADD_B)
725 DO_2OP_SAT(vqaddsh, 2, int16_t, DO_SQADD_H)
726 DO_2OP_SAT(vqaddsw, 4, int32_t, DO_SQADD_W)
727 
728 DO_2OP_SAT(vqsubub, 1, uint8_t, DO_UQSUB_B)
729 DO_2OP_SAT(vqsubuh, 2, uint16_t, DO_UQSUB_H)
730 DO_2OP_SAT(vqsubuw, 4, uint32_t, DO_UQSUB_W)
731 DO_2OP_SAT(vqsubsb, 1, int8_t, DO_SQSUB_B)
732 DO_2OP_SAT(vqsubsh, 2, int16_t, DO_SQSUB_H)
733 DO_2OP_SAT(vqsubsw, 4, int32_t, DO_SQSUB_W)
734 
735 /*
736  * This wrapper fixes up the impedance mismatch between do_sqrshl_bhs()
737  * and friends wanting a uint32_t* sat and our needing a bool*.
738  */
739 #define WRAP_QRSHL_HELPER(FN, N, M, ROUND, satp)                        \
740     ({                                                                  \
741         uint32_t su32 = 0;                                              \
742         typeof(N) r = FN(N, (int8_t)(M), sizeof(N) * 8, ROUND, &su32);  \
743         if (su32) {                                                     \
744             *satp = true;                                               \
745         }                                                               \
746         r;                                                              \
747     })
748 
749 #define DO_SQSHL_OP(N, M, satp) \
750     WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, false, satp)
751 #define DO_UQSHL_OP(N, M, satp) \
752     WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, false, satp)
753 #define DO_SQRSHL_OP(N, M, satp) \
754     WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, true, satp)
755 #define DO_UQRSHL_OP(N, M, satp) \
756     WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, true, satp)
757 #define DO_SUQSHL_OP(N, M, satp) \
758     WRAP_QRSHL_HELPER(do_suqrshl_bhs, N, M, false, satp)
759 
760 DO_2OP_SAT_S(vqshls, DO_SQSHL_OP)
761 DO_2OP_SAT_U(vqshlu, DO_UQSHL_OP)
762 DO_2OP_SAT_S(vqrshls, DO_SQRSHL_OP)
763 DO_2OP_SAT_U(vqrshlu, DO_UQRSHL_OP)
764 
765 /*
766  * Multiply add dual returning high half
767  * The 'FN' here takes four inputs A, B, C, D, a 0/1 indicator of
768  * whether to add the rounding constant, and the pointer to the
769  * saturation flag, and should do "(A * B + C * D) * 2 + rounding constant",
770  * saturate to twice the input size and return the high half; or
771  * (A * B - C * D) etc for VQDMLSDH.
772  */
773 #define DO_VQDMLADH_OP(OP, ESIZE, TYPE, XCHG, ROUND, FN)                \
774     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn,   \
775                                 void *vm)                               \
776     {                                                                   \
777         TYPE *d = vd, *n = vn, *m = vm;                                 \
778         uint16_t mask = mve_element_mask(env);                          \
779         unsigned e;                                                     \
780         bool qc = false;                                                \
781         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
782             bool sat = false;                                           \
783             if ((e & 1) == XCHG) {                                      \
784                 TYPE r = FN(n[H##ESIZE(e)],                             \
785                             m[H##ESIZE(e - XCHG)],                      \
786                             n[H##ESIZE(e + (1 - 2 * XCHG))],            \
787                             m[H##ESIZE(e + (1 - XCHG))],                \
788                             ROUND, &sat);                               \
789                 mergemask(&d[H##ESIZE(e)], r, mask);                    \
790                 qc |= sat & mask & 1;                                   \
791             }                                                           \
792         }                                                               \
793         if (qc) {                                                       \
794             env->vfp.qc[0] = qc;                                        \
795         }                                                               \
796         mve_advance_vpt(env);                                           \
797     }
798 
799 static int8_t do_vqdmladh_b(int8_t a, int8_t b, int8_t c, int8_t d,
800                             int round, bool *sat)
801 {
802     int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 7);
803     return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
804 }
805 
806 static int16_t do_vqdmladh_h(int16_t a, int16_t b, int16_t c, int16_t d,
807                              int round, bool *sat)
808 {
809     int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 15);
810     return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
811 }
812 
813 static int32_t do_vqdmladh_w(int32_t a, int32_t b, int32_t c, int32_t d,
814                              int round, bool *sat)
815 {
816     int64_t m1 = (int64_t)a * b;
817     int64_t m2 = (int64_t)c * d;
818     int64_t r;
819     /*
820      * Architecturally we should do the entire add, double, round
821      * and then check for saturation. We do three saturating adds,
822      * but we need to be careful about the order. If the first
823      * m1 + m2 saturates then it's impossible for the *2+rc to
824      * bring it back into the non-saturated range. However, if
825      * m1 + m2 is negative then it's possible that doing the doubling
826      * would take the intermediate result below INT64_MIN and the
827      * addition of the rounding constant then brings it back in range.
828      * So we add half the rounding constant before doubling rather
829      * than adding the rounding constant after the doubling.
830      */
831     if (sadd64_overflow(m1, m2, &r) ||
832         sadd64_overflow(r, (round << 30), &r) ||
833         sadd64_overflow(r, r, &r)) {
834         *sat = true;
835         return r < 0 ? INT32_MAX : INT32_MIN;
836     }
837     return r >> 32;
838 }
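
/*
 * (Adding round << 30 before the final doubling is equivalent to the
 * architectural "double then add round << 31", since
 * (m1 + m2 + (round << 30)) * 2 == (m1 + m2) * 2 + (round << 31),
 * but it keeps the intermediate value within the checked 64-bit range.)
 */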
839 
840 static int8_t do_vqdmlsdh_b(int8_t a, int8_t b, int8_t c, int8_t d,
841                             int round, bool *sat)
842 {
843     int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 7);
844     return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
845 }
846 
847 static int16_t do_vqdmlsdh_h(int16_t a, int16_t b, int16_t c, int16_t d,
848                              int round, bool *sat)
849 {
850     int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 15);
851     return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
852 }
853 
854 static int32_t do_vqdmlsdh_w(int32_t a, int32_t b, int32_t c, int32_t d,
855                              int round, bool *sat)
856 {
857     int64_t m1 = (int64_t)a * b;
858     int64_t m2 = (int64_t)c * d;
859     int64_t r;
860     /* The same ordering issue as in do_vqdmladh_w applies here too */
861     if (ssub64_overflow(m1, m2, &r) ||
862         sadd64_overflow(r, (round << 30), &r) ||
863         sadd64_overflow(r, r, &r)) {
864         *sat = true;
865         return r < 0 ? INT32_MAX : INT32_MIN;
866     }
867     return r >> 32;
868 }
869 
870 DO_VQDMLADH_OP(vqdmladhb, 1, int8_t, 0, 0, do_vqdmladh_b)
871 DO_VQDMLADH_OP(vqdmladhh, 2, int16_t, 0, 0, do_vqdmladh_h)
872 DO_VQDMLADH_OP(vqdmladhw, 4, int32_t, 0, 0, do_vqdmladh_w)
873 DO_VQDMLADH_OP(vqdmladhxb, 1, int8_t, 1, 0, do_vqdmladh_b)
874 DO_VQDMLADH_OP(vqdmladhxh, 2, int16_t, 1, 0, do_vqdmladh_h)
875 DO_VQDMLADH_OP(vqdmladhxw, 4, int32_t, 1, 0, do_vqdmladh_w)
876 
877 DO_VQDMLADH_OP(vqrdmladhb, 1, int8_t, 0, 1, do_vqdmladh_b)
878 DO_VQDMLADH_OP(vqrdmladhh, 2, int16_t, 0, 1, do_vqdmladh_h)
879 DO_VQDMLADH_OP(vqrdmladhw, 4, int32_t, 0, 1, do_vqdmladh_w)
880 DO_VQDMLADH_OP(vqrdmladhxb, 1, int8_t, 1, 1, do_vqdmladh_b)
881 DO_VQDMLADH_OP(vqrdmladhxh, 2, int16_t, 1, 1, do_vqdmladh_h)
882 DO_VQDMLADH_OP(vqrdmladhxw, 4, int32_t, 1, 1, do_vqdmladh_w)
883 
884 DO_VQDMLADH_OP(vqdmlsdhb, 1, int8_t, 0, 0, do_vqdmlsdh_b)
885 DO_VQDMLADH_OP(vqdmlsdhh, 2, int16_t, 0, 0, do_vqdmlsdh_h)
886 DO_VQDMLADH_OP(vqdmlsdhw, 4, int32_t, 0, 0, do_vqdmlsdh_w)
887 DO_VQDMLADH_OP(vqdmlsdhxb, 1, int8_t, 1, 0, do_vqdmlsdh_b)
888 DO_VQDMLADH_OP(vqdmlsdhxh, 2, int16_t, 1, 0, do_vqdmlsdh_h)
889 DO_VQDMLADH_OP(vqdmlsdhxw, 4, int32_t, 1, 0, do_vqdmlsdh_w)
890 
891 DO_VQDMLADH_OP(vqrdmlsdhb, 1, int8_t, 0, 1, do_vqdmlsdh_b)
892 DO_VQDMLADH_OP(vqrdmlsdhh, 2, int16_t, 0, 1, do_vqdmlsdh_h)
893 DO_VQDMLADH_OP(vqrdmlsdhw, 4, int32_t, 0, 1, do_vqdmlsdh_w)
894 DO_VQDMLADH_OP(vqrdmlsdhxb, 1, int8_t, 1, 1, do_vqdmlsdh_b)
895 DO_VQDMLADH_OP(vqrdmlsdhxh, 2, int16_t, 1, 1, do_vqdmlsdh_h)
896 DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w)
897 
898 #define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN)                              \
899     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn,   \
900                                 uint32_t rm)                            \
901     {                                                                   \
902         TYPE *d = vd, *n = vn;                                          \
903         TYPE m = rm;                                                    \
904         uint16_t mask = mve_element_mask(env);                          \
905         unsigned e;                                                     \
906         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
907             mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m), mask);    \
908         }                                                               \
909         mve_advance_vpt(env);                                           \
910     }
911 
912 #define DO_2OP_SAT_SCALAR(OP, ESIZE, TYPE, FN)                          \
913     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn,   \
914                                 uint32_t rm)                            \
915     {                                                                   \
916         TYPE *d = vd, *n = vn;                                          \
917         TYPE m = rm;                                                    \
918         uint16_t mask = mve_element_mask(env);                          \
919         unsigned e;                                                     \
920         bool qc = false;                                                \
921         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
922             bool sat = false;                                           \
923             mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m, &sat),     \
924                       mask);                                            \
925             qc |= sat & mask & 1;                                       \
926         }                                                               \
927         if (qc) {                                                       \
928             env->vfp.qc[0] = qc;                                        \
929         }                                                               \
930         mve_advance_vpt(env);                                           \
931     }
932 
933 /* provide unsigned 2-op scalar helpers for all sizes */
934 #define DO_2OP_SCALAR_U(OP, FN)                 \
935     DO_2OP_SCALAR(OP##b, 1, uint8_t, FN)        \
936     DO_2OP_SCALAR(OP##h, 2, uint16_t, FN)       \
937     DO_2OP_SCALAR(OP##w, 4, uint32_t, FN)
938 #define DO_2OP_SCALAR_S(OP, FN)                 \
939     DO_2OP_SCALAR(OP##b, 1, int8_t, FN)         \
940     DO_2OP_SCALAR(OP##h, 2, int16_t, FN)        \
941     DO_2OP_SCALAR(OP##w, 4, int32_t, FN)
942 
943 DO_2OP_SCALAR_U(vadd_scalar, DO_ADD)
944 DO_2OP_SCALAR_U(vsub_scalar, DO_SUB)
945 DO_2OP_SCALAR_U(vmul_scalar, DO_MUL)
946 DO_2OP_SCALAR_S(vhadds_scalar, do_vhadd_s)
947 DO_2OP_SCALAR_U(vhaddu_scalar, do_vhadd_u)
948 DO_2OP_SCALAR_S(vhsubs_scalar, do_vhsub_s)
949 DO_2OP_SCALAR_U(vhsubu_scalar, do_vhsub_u)
950 
951 DO_2OP_SAT_SCALAR(vqaddu_scalarb, 1, uint8_t, DO_UQADD_B)
952 DO_2OP_SAT_SCALAR(vqaddu_scalarh, 2, uint16_t, DO_UQADD_H)
953 DO_2OP_SAT_SCALAR(vqaddu_scalarw, 4, uint32_t, DO_UQADD_W)
954 DO_2OP_SAT_SCALAR(vqadds_scalarb, 1, int8_t, DO_SQADD_B)
955 DO_2OP_SAT_SCALAR(vqadds_scalarh, 2, int16_t, DO_SQADD_H)
956 DO_2OP_SAT_SCALAR(vqadds_scalarw, 4, int32_t, DO_SQADD_W)
957 
958 DO_2OP_SAT_SCALAR(vqsubu_scalarb, 1, uint8_t, DO_UQSUB_B)
959 DO_2OP_SAT_SCALAR(vqsubu_scalarh, 2, uint16_t, DO_UQSUB_H)
960 DO_2OP_SAT_SCALAR(vqsubu_scalarw, 4, uint32_t, DO_UQSUB_W)
961 DO_2OP_SAT_SCALAR(vqsubs_scalarb, 1, int8_t, DO_SQSUB_B)
962 DO_2OP_SAT_SCALAR(vqsubs_scalarh, 2, int16_t, DO_SQSUB_H)
963 DO_2OP_SAT_SCALAR(vqsubs_scalarw, 4, int32_t, DO_SQSUB_W)
964 
965 DO_2OP_SAT_SCALAR(vqdmulh_scalarb, 1, int8_t, DO_QDMULH_B)
966 DO_2OP_SAT_SCALAR(vqdmulh_scalarh, 2, int16_t, DO_QDMULH_H)
967 DO_2OP_SAT_SCALAR(vqdmulh_scalarw, 4, int32_t, DO_QDMULH_W)
968 DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B)
969 DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H)
970 DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W)
971 
972 /*
973  * Long saturating scalar ops. As with DO_2OP_L, TYPE and H are for the
974  * input (smaller) type and LESIZE, LTYPE, LH for the output (long) type.
975  * SATMASK specifies which bits of the predicate mask matter for determining
976  * whether to propagate a saturation indication into FPSCR.QC -- for
977  * the 16x16->32 case we must check only the bit corresponding to the T or B
978  * half that we used, but for the 32x32->64 case we propagate if the mask
979  * bit is set for either half.
980  */
981 #define DO_2OP_SAT_SCALAR_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \
982     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn,   \
983                                 uint32_t rm)                            \
984     {                                                                   \
985         LTYPE *d = vd;                                                  \
986         TYPE *n = vn;                                                   \
987         TYPE m = rm;                                                    \
988         uint16_t mask = mve_element_mask(env);                          \
989         unsigned le;                                                    \
990         bool qc = false;                                                \
991         for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) {         \
992             bool sat = false;                                           \
993             LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], m, &sat);    \
994             mergemask(&d[H##LESIZE(le)], r, mask);                      \
995             qc |= sat && (mask & SATMASK);                              \
996         }                                                               \
997         if (qc) {                                                       \
998             env->vfp.qc[0] = qc;                                        \
999         }                                                               \
1000         mve_advance_vpt(env);                                           \
1001     }
1002 
1003 static inline int32_t do_qdmullh(int16_t n, int16_t m, bool *sat)
1004 {
1005     int64_t r = ((int64_t)n * m) * 2;
1006     return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat);
1007 }
1008 
1009 static inline int64_t do_qdmullw(int32_t n, int32_t m, bool *sat)
1010 {
1011     /* The multiply can't overflow, but the doubling might */
1012     int64_t r = (int64_t)n * m;
1013     if (r > INT64_MAX / 2) {
1014         *sat = true;
1015         return INT64_MAX;
1016     } else if (r < INT64_MIN / 2) {
1017         *sat = true;
1018         return INT64_MIN;
1019     } else {
1020         return r * 2;
1021     }
1022 }
1023 
1024 #define SATMASK16B 1
1025 #define SATMASK16T (1 << 2)
1026 #define SATMASK32 ((1 << 4) | 1)
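
/*
 * For example, for the 16x16->32 "top" forms only bit 2 of each 4-bit
 * predicate group (the bit covering the top halfword actually read) is
 * relevant, hence SATMASK16T; the 32x32->64 forms check bits 0 and 4,
 * i.e. either 32-bit half of the 64-bit lane.
 */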
1027 
1028 DO_2OP_SAT_SCALAR_L(vqdmullb_scalarh, 0, 2, int16_t, 4, int32_t, \
1029                     do_qdmullh, SATMASK16B)
1030 DO_2OP_SAT_SCALAR_L(vqdmullb_scalarw, 0, 4, int32_t, 8, int64_t, \
1031                     do_qdmullw, SATMASK32)
1032 DO_2OP_SAT_SCALAR_L(vqdmullt_scalarh, 1, 2, int16_t, 4, int32_t, \
1033                     do_qdmullh, SATMASK16T)
1034 DO_2OP_SAT_SCALAR_L(vqdmullt_scalarw, 1, 4, int32_t, 8, int64_t, \
1035                     do_qdmullw, SATMASK32)
1036 
1037 /*
1038  * Long saturating ops
1039  */
1040 #define DO_2OP_SAT_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK)  \
1041     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn,   \
1042                                 void *vm)                               \
1043     {                                                                   \
1044         LTYPE *d = vd;                                                  \
1045         TYPE *n = vn, *m = vm;                                          \
1046         uint16_t mask = mve_element_mask(env);                          \
1047         unsigned le;                                                    \
1048         bool qc = false;                                                \
1049         for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) {         \
1050             bool sat = false;                                           \
1051             LTYPE op1 = n[H##ESIZE(le * 2 + TOP)];                      \
1052             LTYPE op2 = m[H##ESIZE(le * 2 + TOP)];                      \
1053             mergemask(&d[H##LESIZE(le)], FN(op1, op2, &sat), mask);     \
1054             qc |= sat && (mask & SATMASK);                              \
1055         }                                                               \
1056         if (qc) {                                                       \
1057             env->vfp.qc[0] = qc;                                        \
1058         }                                                               \
1059         mve_advance_vpt(env);                                           \
1060     }
1061 
1062 DO_2OP_SAT_L(vqdmullbh, 0, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16B)
1063 DO_2OP_SAT_L(vqdmullbw, 0, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32)
1064 DO_2OP_SAT_L(vqdmullth, 1, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16T)
1065 DO_2OP_SAT_L(vqdmulltw, 1, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32)
1066 
1067 static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m)
1068 {
1069     m &= 0xff;
1070     if (m == 0) {
1071         return 0;
1072     }
1073     n = revbit8(n);
1074     if (m < 8) {
1075         n >>= 8 - m;
1076     }
1077     return n;
1078 }
1079 
1080 static inline uint32_t do_vbrsrh(uint32_t n, uint32_t m)
1081 {
1082     m &= 0xff;
1083     if (m == 0) {
1084         return 0;
1085     }
1086     n = revbit16(n);
1087     if (m < 16) {
1088         n >>= 16 - m;
1089     }
1090     return n;
1091 }
1092 
1093 static inline uint32_t do_vbrsrw(uint32_t n, uint32_t m)
1094 {
1095     m &= 0xff;
1096     if (m == 0) {
1097         return 0;
1098     }
1099     n = revbit32(n);
1100     if (m < 32) {
1101         n >>= 32 - m;
1102     }
1103     return n;
1104 }
1105 
1106 DO_2OP_SCALAR(vbrsrb, 1, uint8_t, do_vbrsrb)
1107 DO_2OP_SCALAR(vbrsrh, 2, uint16_t, do_vbrsrh)
1108 DO_2OP_SCALAR(vbrsrw, 4, uint32_t, do_vbrsrw)
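
/*
 * VBRSR example: do_vbrsrb(0x01, 4) reverses the whole byte to 0x80 and
 * then shifts right by 8 - 4, giving 0x08, i.e. the low 4 bits of the
 * input bit-reversed with everything above them discarded.
 */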
1109 
1110 /*
1111  * Multiply add long dual accumulate ops.
1112  */
1113 #define DO_LDAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC)                 \
1114     uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn,         \
1115                                     void *vm, uint64_t a)               \
1116     {                                                                   \
1117         uint16_t mask = mve_element_mask(env);                          \
1118         unsigned e;                                                     \
1119         TYPE *n = vn, *m = vm;                                          \
1120         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {              \
1121             if (mask & 1) {                                             \
1122                 if (e & 1) {                                            \
1123                     a ODDACC                                            \
1124                         (int64_t)n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \
1125                 } else {                                                \
1126                     a EVENACC                                           \
1127                         (int64_t)n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \
1128                 }                                                       \
1129             }                                                           \
1130         }                                                               \
1131         mve_advance_vpt(env);                                           \
1132         return a;                                                       \
1133     }
1134 
1135 DO_LDAV(vmlaldavsh, 2, int16_t, false, +=, +=)
1136 DO_LDAV(vmlaldavxsh, 2, int16_t, true, +=, +=)
1137 DO_LDAV(vmlaldavsw, 4, int32_t, false, +=, +=)
1138 DO_LDAV(vmlaldavxsw, 4, int32_t, true, +=, +=)
1139 
1140 DO_LDAV(vmlaldavuh, 2, uint16_t, false, +=, +=)
1141 DO_LDAV(vmlaldavuw, 4, uint32_t, false, +=, +=)
1142 
1143 DO_LDAV(vmlsldavsh, 2, int16_t, false, +=, -=)
1144 DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=)
1145 DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=)
1146 DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=)
1147 
1148 /*
1149  * Rounding multiply add long dual accumulate high. In the pseudocode
1150  * this is implemented with a 72-bit internal accumulator value of which
1151  * the top 64 bits are returned. We optimize this to avoid having to
1152  * use 128-bit arithmetic -- we can do this because the 72-bit accumulator
1153  * is squashed back into 64-bits after each beat.
1154  */
1155 #define DO_LDAVH(OP, TYPE, LTYPE, XCHG, SUB)                            \
1156     uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn,         \
1157                                     void *vm, uint64_t a)               \
1158     {                                                                   \
1159         uint16_t mask = mve_element_mask(env);                          \
1160         unsigned e;                                                     \
1161         TYPE *n = vn, *m = vm;                                          \
1162         for (e = 0; e < 16 / 4; e++, mask >>= 4) {                      \
1163             if (mask & 1) {                                             \
1164                 LTYPE mul;                                              \
1165                 if (e & 1) {                                            \
1166                     mul = (LTYPE)n[H4(e - 1 * XCHG)] * m[H4(e)];        \
1167                     if (SUB) {                                          \
1168                         mul = -mul;                                     \
1169                     }                                                   \
1170                 } else {                                                \
1171                     mul = (LTYPE)n[H4(e + 1 * XCHG)] * m[H4(e)];        \
1172                 }                                                       \
1173                 mul = (mul >> 8) + ((mul >> 7) & 1);                    \
1174                 a += mul;                                               \
1175             }                                                           \
1176         }                                                               \
1177         mve_advance_vpt(env);                                           \
1178         return a;                                                       \
1179     }
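
/*
 * The per-beat squash above rounds each product towards the top part of
 * the notional wider accumulator: (mul >> 8) + ((mul >> 7) & 1) is a
 * round-to-nearest divide by 256, e.g. mul == 0x180 (384, i.e. 1.5 *
 * 256) becomes 2.
 */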
1180 
1181 DO_LDAVH(vrmlaldavhsw, int32_t, int64_t, false, false)
1182 DO_LDAVH(vrmlaldavhxsw, int32_t, int64_t, true, false)
1183 
1184 DO_LDAVH(vrmlaldavhuw, uint32_t, uint64_t, false, false)
1185 
1186 DO_LDAVH(vrmlsldavhsw, int32_t, int64_t, false, true)
1187 DO_LDAVH(vrmlsldavhxsw, int32_t, int64_t, true, true)
1188 
1189 /* Vector add across vector */
1190 #define DO_VADDV(OP, ESIZE, TYPE)                               \
1191     uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
1192                                     uint32_t ra)                \
1193     {                                                           \
1194         uint16_t mask = mve_element_mask(env);                  \
1195         unsigned e;                                             \
1196         TYPE *m = vm;                                           \
1197         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {      \
1198             if (mask & 1) {                                     \
1199                 ra += m[H##ESIZE(e)];                           \
1200             }                                                   \
1201         }                                                       \
1202         mve_advance_vpt(env);                                   \
1203         return ra;                                              \
1204     }                                                           \
1205 
1206 DO_VADDV(vaddvsb, 1, int8_t)
1207 DO_VADDV(vaddvsh, 2, int16_t)
1208 DO_VADDV(vaddvsw, 4, int32_t)
1209 DO_VADDV(vaddvub, 1, uint8_t)
1210 DO_VADDV(vaddvuh, 2, uint16_t)
1211 DO_VADDV(vaddvuw, 4, uint32_t)
1212 
1213 #define DO_VADDLV(OP, TYPE, LTYPE)                              \
1214     uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
1215                                     uint64_t ra)                \
1216     {                                                           \
1217         uint16_t mask = mve_element_mask(env);                  \
1218         unsigned e;                                             \
1219         TYPE *m = vm;                                           \
1220         for (e = 0; e < 16 / 4; e++, mask >>= 4) {              \
1221             if (mask & 1) {                                     \
1222                 ra += (LTYPE)m[H4(e)];                          \
1223             }                                                   \
1224         }                                                       \
1225         mve_advance_vpt(env);                                   \
1226         return ra;                                              \
1227     }                                                           \
1228 
1229 DO_VADDLV(vaddlv_s, int32_t, int64_t)
1230 DO_VADDLV(vaddlv_u, uint32_t, uint64_t)
1231 
1232 /* Shifts by immediate */
1233 #define DO_2SHIFT(OP, ESIZE, TYPE, FN)                          \
1234     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd,     \
1235                                 void *vm, uint32_t shift)       \
1236     {                                                           \
1237         TYPE *d = vd, *m = vm;                                  \
1238         uint16_t mask = mve_element_mask(env);                  \
1239         unsigned e;                                             \
1240         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {      \
1241             mergemask(&d[H##ESIZE(e)],                          \
1242                       FN(m[H##ESIZE(e)], shift), mask);         \
1243         }                                                       \
1244         mve_advance_vpt(env);                                   \
1245     }
1246 
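/*
 * Illustrative sketch: DO_2SHIFT(vshli_uh, 2, uint16_t, DO_VSHLU) (one of
 * the instantiations made below) reduces to a loop of the form
 *
 *   for (e = 0; e < 8; e++, mask >>= 2) {
 *       mergemask(&d[H2(e)], DO_VSHLU(m[H2(e)], shift), mask);
 *   }
 *
 * so the shifted value is computed for every halfword lane but mergemask()
 * commits it only where the predicate mask allows, leaving inactive parts
 * of the destination untouched.
 */
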
1247 #define DO_2SHIFT_SAT(OP, ESIZE, TYPE, FN)                      \
1248     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd,     \
1249                                 void *vm, uint32_t shift)       \
1250     {                                                           \
1251         TYPE *d = vd, *m = vm;                                  \
1252         uint16_t mask = mve_element_mask(env);                  \
1253         unsigned e;                                             \
1254         bool qc = false;                                        \
1255         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {      \
1256             bool sat = false;                                   \
1257             mergemask(&d[H##ESIZE(e)],                          \
1258                       FN(m[H##ESIZE(e)], shift, &sat), mask);   \
1259             qc |= sat & mask & 1;                               \
1260         }                                                       \
1261         if (qc) {                                               \
1262             env->vfp.qc[0] = qc;                                \
1263         }                                                       \
1264         mve_advance_vpt(env);                                   \
1265     }
1266 
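/*
 * Illustrative note: the saturating variant differs from DO_2SHIFT only in
 * threading a per-element 'sat' flag through FN and folding it into QC.
 * Because of the 'sat & mask & 1' term, a lane that saturates while its
 * lowest predicate bit is clear does not set QC. As a concrete (assumed)
 * example, if DO_UQSHL_OP implements the usual unsigned saturating left
 * shift, vqshli_ub with shift 4 on a lane holding 0x1f produces 0xff with
 * sat set, since 0x1f0 exceeds UINT8_MAX.
 */
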
1267 /* Provide signed, unsigned and saturating 2-op shift helpers for all sizes */
1268 #define DO_2SHIFT_U(OP, FN)                     \
1269     DO_2SHIFT(OP##b, 1, uint8_t, FN)            \
1270     DO_2SHIFT(OP##h, 2, uint16_t, FN)           \
1271     DO_2SHIFT(OP##w, 4, uint32_t, FN)
1272 #define DO_2SHIFT_S(OP, FN)                     \
1273     DO_2SHIFT(OP##b, 1, int8_t, FN)             \
1274     DO_2SHIFT(OP##h, 2, int16_t, FN)            \
1275     DO_2SHIFT(OP##w, 4, int32_t, FN)
1276 
1277 #define DO_2SHIFT_SAT_U(OP, FN)                 \
1278     DO_2SHIFT_SAT(OP##b, 1, uint8_t, FN)        \
1279     DO_2SHIFT_SAT(OP##h, 2, uint16_t, FN)       \
1280     DO_2SHIFT_SAT(OP##w, 4, uint32_t, FN)
1281 #define DO_2SHIFT_SAT_S(OP, FN)                 \
1282     DO_2SHIFT_SAT(OP##b, 1, int8_t, FN)         \
1283     DO_2SHIFT_SAT(OP##h, 2, int16_t, FN)        \
1284     DO_2SHIFT_SAT(OP##w, 4, int32_t, FN)
1285 
1286 DO_2SHIFT_U(vshli_u, DO_VSHLU)
1287 DO_2SHIFT_S(vshli_s, DO_VSHLS)
1288 DO_2SHIFT_SAT_U(vqshli_u, DO_UQSHL_OP)
1289 DO_2SHIFT_SAT_S(vqshli_s, DO_SQSHL_OP)
1290 DO_2SHIFT_SAT_S(vqshlui_s, DO_SUQSHL_OP)
1291 DO_2SHIFT_U(vrshli_u, DO_VRSHLU)
1292 DO_2SHIFT_S(vrshli_s, DO_VRSHLS)
1293 
1294 /* Shift-and-insert; we always work with 64 bits at a time */
1295 #define DO_2SHIFT_INSERT(OP, ESIZE, SHIFTFN, MASKFN)                    \
1296     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd,             \
1297                                 void *vm, uint32_t shift)               \
1298     {                                                                   \
1299         uint64_t *d = vd, *m = vm;                                      \
1300         uint16_t mask;                                                  \
1301         uint64_t shiftmask;                                             \
1302         unsigned e;                                                     \
1303         if (shift == ESIZE * 8) {                                       \
1304             /*                                                          \
1305              * Only VSRI can shift by <dt>; it should mean "don't       \
1306              * update the destination". The generic logic can't handle  \
1307              * this because it would try to shift by an out-of-range    \
1308              * amount, so special case it here.                         \
1309              */                                                         \
1310             goto done;                                                  \
1311         }                                                               \
1312         assert(shift < ESIZE * 8);                                      \
1313         mask = mve_element_mask(env);                                   \
1314         /* ESIZE / 2 gives the MO_* value if ESIZE is in [1,2,4] */     \
1315         shiftmask = dup_const(ESIZE / 2, MASKFN(ESIZE * 8, shift));     \
1316         for (e = 0; e < 16 / 8; e++, mask >>= 8) {                      \
1317             uint64_t r = (SHIFTFN(m[H8(e)], shift) & shiftmask) |       \
1318                 (d[H8(e)] & ~shiftmask);                                \
1319             mergemask(&d[H8(e)], r, mask);                              \
1320         }                                                               \
1321 done:                                                                   \
1322         mve_advance_vpt(env);                                           \
1323     }
1324 
1325 #define DO_SHL(N, SHIFT) ((N) << (SHIFT))
1326 #define DO_SHR(N, SHIFT) ((N) >> (SHIFT))
1327 #define SHL_MASK(EBITS, SHIFT) MAKE_64BIT_MASK((SHIFT), (EBITS) - (SHIFT))
1328 #define SHR_MASK(EBITS, SHIFT) MAKE_64BIT_MASK(0, (EBITS) - (SHIFT))
1329 
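/*
 * Worked example (illustrative): for vsrib with shift == 3, SHR_MASK(8, 3)
 * is 0x1f and dup_const() replicates it into every byte of shiftmask, so
 * each destination byte becomes
 *
 *   ((m_byte >> 3) & 0x1f) | (d_byte & 0xe0)
 *
 * i.e. the inserted value occupies bits [4:0] while the top 3 destination
 * bits are preserved, and any bits dragged in from the neighbouring byte
 * by the 64-bit shift are masked off. For vslib with shift == 3,
 * SHL_MASK(8, 3) is 0xf8, so the low 3 destination bits are kept instead.
 */
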
1330 DO_2SHIFT_INSERT(vsrib, 1, DO_SHR, SHR_MASK)
1331 DO_2SHIFT_INSERT(vsrih, 2, DO_SHR, SHR_MASK)
1332 DO_2SHIFT_INSERT(vsriw, 4, DO_SHR, SHR_MASK)
1333 DO_2SHIFT_INSERT(vslib, 1, DO_SHL, SHL_MASK)
1334 DO_2SHIFT_INSERT(vslih, 2, DO_SHL, SHL_MASK)
1335 DO_2SHIFT_INSERT(vsliw, 4, DO_SHL, SHL_MASK)
1336 
1337 /*
1338  * Long shifts taking half-sized inputs from top or bottom of the input
1339  * vector and producing a double-width result. ESIZE, TYPE are for
1340  * the input, and LESIZE, LTYPE for the output.
1341  * Unlike the normal shift helpers, we do not handle negative shift counts,
1342  * because the long shift is strictly left-only.
1343  */
1344 #define DO_VSHLL(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE)                   \
1345     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd,             \
1346                                 void *vm, uint32_t shift)               \
1347     {                                                                   \
1348         LTYPE *d = vd;                                                  \
1349         TYPE *m = vm;                                                   \
1350         uint16_t mask = mve_element_mask(env);                          \
1351         unsigned le;                                                    \
1352         assert(shift <= 16);                                            \
1353         for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) {         \
1354             LTYPE r = (LTYPE)m[H##ESIZE(le * 2 + TOP)] << shift;        \
1355             mergemask(&d[H##LESIZE(le)], r, mask);                      \
1356         }                                                               \
1357         mve_advance_vpt(env);                                           \
1358     }
1359 
1360 #define DO_VSHLL_ALL(OP, TOP)                                \
1361     DO_VSHLL(OP##sb, TOP, 1, int8_t, 2, int16_t)             \
1362     DO_VSHLL(OP##ub, TOP, 1, uint8_t, 2, uint16_t)           \
1363     DO_VSHLL(OP##sh, TOP, 2, int16_t, 4, int32_t)            \
1364     DO_VSHLL(OP##uh, TOP, 2, uint16_t, 4, uint32_t)
1365 
1366 DO_VSHLL_ALL(vshllb, false)
1367 DO_VSHLL_ALL(vshllt, true)
1368 
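/*
 * Worked example (illustrative): for vshllbsb (bottom, signed byte) with
 * shift == 4, output halfword lane le is built from input byte lane
 * 2 * le, sign-extended before the shift: an input byte of 0x80 (-128)
 * becomes (int16_t)-128 << 4, i.e. 0xf800. The "top" forms read byte lane
 * 2 * le + 1 instead, the high byte of each halfword container.
 */
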
1369 /*
1370  * Narrowing right shifts, taking a double-sized input, shifting it
1371  * and putting the result in either the top or bottom half of the output.
1372  * ESIZE, TYPE are for the output, and LESIZE, LTYPE for the input.
1373  */
1374 #define DO_VSHRN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN)       \
1375     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd,     \
1376                                 void *vm, uint32_t shift)       \
1377     {                                                           \
1378         LTYPE *m = vm;                                          \
1379         TYPE *d = vd;                                           \
1380         uint16_t mask = mve_element_mask(env);                  \
1381         unsigned le;                                            \
1382         mask >>= ESIZE * TOP;                                   \
1383         for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
1384             TYPE r = FN(m[H##LESIZE(le)], shift);               \
1385             mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask);     \
1386         }                                                       \
1387         mve_advance_vpt(env);                                   \
1388     }
1389 
1390 #define DO_VSHRN_ALL(OP, FN)                                    \
1391     DO_VSHRN(OP##bb, false, 1, uint8_t, 2, uint16_t, FN)        \
1392     DO_VSHRN(OP##bh, false, 2, uint16_t, 4, uint32_t, FN)       \
1393     DO_VSHRN(OP##tb, true, 1, uint8_t, 2, uint16_t, FN)         \
1394     DO_VSHRN(OP##th, true, 2, uint16_t, 4, uint32_t, FN)
1395 
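/*
 * Illustrative note: the initial 'mask >>= ESIZE * TOP' lines the
 * predicate bits up with the output sub-element being written, so a "top"
 * byte narrowing tests predicate bits 1, 3, 5, ... rather than 0, 2, 4, ...
 * As a concrete case, vshrnbb with shift == 4 stores
 * (uint8_t)(m[H2(le)] >> 4) into the bottom byte of each destination
 * halfword and leaves the top bytes with their previous contents.
 */
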
1396 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
1397 {
1398     if (likely(sh < 64)) {
1399         return (x >> sh) + ((x >> (sh - 1)) & 1);
1400     } else if (sh == 64) {
1401         return x >> 63;
1402     } else {
1403         return 0;
1404     }
1405 }
1406 
1407 static inline int64_t do_srshr(int64_t x, unsigned sh)
1408 {
1409     if (likely(sh < 64)) {
1410         return (x >> sh) + ((x >> (sh - 1)) & 1);
1411     } else {
1412         /* Rounding the sign bit always produces 0. */
1413         return 0;
1414     }
1415 }
1416 
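/*
 * Worked examples (illustrative): the rounding term adds back the bit that
 * was shifted out just below the result, i.e. round-to-nearest with ties
 * going upwards:
 *   do_urshr(21, 3) = (21 >> 3) + ((21 >> 2) & 1) = 2 + 1 = 3    (21/8 = 2.625)
 *   do_srshr(-5, 1) = (-5 >> 1) + ((-5 >> 0) & 1) = -3 + 1 = -2  (-5/2 = -2.5)
 * The sh == 64 unsigned case returns x >> 63 because adding the rounding
 * constant 1 << 63 can carry into bit 64.
 */
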
1417 DO_VSHRN_ALL(vshrn, DO_SHR)
1418 DO_VSHRN_ALL(vrshrn, do_urshr)
1419 
1420 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max,
1421                                  bool *satp)
1422 {
1423     if (val > max) {
1424         *satp = true;
1425         return max;
1426     } else if (val < min) {
1427         *satp = true;
1428         return min;
1429     } else {
1430         return val;
1431     }
1432 }
1433 
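/*
 * Illustrative example: do_sat_bhs(300, INT8_MIN, INT8_MAX, &sat) returns
 * 127 and sets *satp; an in-range value is returned unchanged and leaves
 * *satp alone, which is why the callers below initialise their 'sat' flag
 * to false for each element.
 */
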
1434 /* Saturating narrowing right shifts */
1435 #define DO_VSHRN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN)   \
1436     void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd,     \
1437                                 void *vm, uint32_t shift)       \
1438     {                                                           \
1439         LTYPE *m = vm;                                          \
1440         TYPE *d = vd;                                           \
1441         uint16_t mask = mve_element_mask(env);                  \
1442         bool qc = false;                                        \
1443         unsigned le;                                            \
1444         mask >>= ESIZE * TOP;                                   \
1445         for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
1446             bool sat = false;                                   \
1447             TYPE r = FN(m[H##LESIZE(le)], shift, &sat);         \
1448             mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask);     \
1449             qc |= sat & mask & 1;                               \
1450         }                                                       \
1451         if (qc) {                                               \
1452             env->vfp.qc[0] = qc;                                \
1453         }                                                       \
1454         mve_advance_vpt(env);                                   \
1455     }
1456 
1457 #define DO_VSHRN_SAT_UB(BOP, TOP, FN)                           \
1458     DO_VSHRN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN)       \
1459     DO_VSHRN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN)
1460 
1461 #define DO_VSHRN_SAT_UH(BOP, TOP, FN)                           \
1462     DO_VSHRN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN)      \
1463     DO_VSHRN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN)
1464 
1465 #define DO_VSHRN_SAT_SB(BOP, TOP, FN)                           \
1466     DO_VSHRN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN)         \
1467     DO_VSHRN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN)
1468 
1469 #define DO_VSHRN_SAT_SH(BOP, TOP, FN)                           \
1470     DO_VSHRN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN)        \
1471     DO_VSHRN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN)
1472 
1473 #define DO_SHRN_SB(N, M, SATP)                                  \
1474     do_sat_bhs((int64_t)(N) >> (M), INT8_MIN, INT8_MAX, SATP)
1475 #define DO_SHRN_UB(N, M, SATP)                                  \
1476     do_sat_bhs((uint64_t)(N) >> (M), 0, UINT8_MAX, SATP)
1477 #define DO_SHRUN_B(N, M, SATP)                                  \
1478     do_sat_bhs((int64_t)(N) >> (M), 0, UINT8_MAX, SATP)
1479 
1480 #define DO_SHRN_SH(N, M, SATP)                                  \
1481     do_sat_bhs((int64_t)(N) >> (M), INT16_MIN, INT16_MAX, SATP)
1482 #define DO_SHRN_UH(N, M, SATP)                                  \
1483     do_sat_bhs((uint64_t)(N) >> (M), 0, UINT16_MAX, SATP)
1484 #define DO_SHRUN_H(N, M, SATP)                                  \
1485     do_sat_bhs((int64_t)(N) >> (M), 0, UINT16_MAX, SATP)
1486 
1487 #define DO_RSHRN_SB(N, M, SATP)                                 \
1488     do_sat_bhs(do_srshr(N, M), INT8_MIN, INT8_MAX, SATP)
1489 #define DO_RSHRN_UB(N, M, SATP)                                 \
1490     do_sat_bhs(do_urshr(N, M), 0, UINT8_MAX, SATP)
1491 #define DO_RSHRUN_B(N, M, SATP)                                 \
1492     do_sat_bhs(do_srshr(N, M), 0, UINT8_MAX, SATP)
1493 
1494 #define DO_RSHRN_SH(N, M, SATP)                                 \
1495     do_sat_bhs(do_srshr(N, M), INT16_MIN, INT16_MAX, SATP)
1496 #define DO_RSHRN_UH(N, M, SATP)                                 \
1497     do_sat_bhs(do_urshr(N, M), 0, UINT16_MAX, SATP)
1498 #define DO_RSHRUN_H(N, M, SATP)                                 \
1499     do_sat_bhs(do_srshr(N, M), 0, UINT16_MAX, SATP)
1500 
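/*
 * Worked example (illustrative): DO_RSHRN_UB(0x1ff, 4, &sat) computes
 * do_urshr(0x1ff, 4) = 0x1f + 1 = 0x20, which is within [0, 255], so no
 * saturation occurs. DO_SHRUN_B(-1, 1, &sat) computes -1 >> 1 = -1, which
 * is below the unsigned minimum, so the result is clamped to 0 and sat is
 * set, feeding QC via the DO_VSHRN_SAT loop above.
 */
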
1501 DO_VSHRN_SAT_SB(vqshrnb_sb, vqshrnt_sb, DO_SHRN_SB)
1502 DO_VSHRN_SAT_SH(vqshrnb_sh, vqshrnt_sh, DO_SHRN_SH)
1503 DO_VSHRN_SAT_UB(vqshrnb_ub, vqshrnt_ub, DO_SHRN_UB)
1504 DO_VSHRN_SAT_UH(vqshrnb_uh, vqshrnt_uh, DO_SHRN_UH)
1505 DO_VSHRN_SAT_SB(vqshrunbb, vqshruntb, DO_SHRUN_B)
1506 DO_VSHRN_SAT_SH(vqshrunbh, vqshrunth, DO_SHRUN_H)
1507 
1508 DO_VSHRN_SAT_SB(vqrshrnb_sb, vqrshrnt_sb, DO_RSHRN_SB)
1509 DO_VSHRN_SAT_SH(vqrshrnb_sh, vqrshrnt_sh, DO_RSHRN_SH)
1510 DO_VSHRN_SAT_UB(vqrshrnb_ub, vqrshrnt_ub, DO_RSHRN_UB)
1511 DO_VSHRN_SAT_UH(vqrshrnb_uh, vqrshrnt_uh, DO_RSHRN_UH)
1512 DO_VSHRN_SAT_SB(vqrshrunbb, vqrshruntb, DO_RSHRUN_B)
1513 DO_VSHRN_SAT_SH(vqrshrunbh, vqrshrunth, DO_RSHRUN_H)
1514 
1515 uint32_t HELPER(mve_vshlc)(CPUARMState *env, void *vd, uint32_t rdm,
1516                            uint32_t shift)
1517 {
1518     uint32_t *d = vd;
1519     uint16_t mask = mve_element_mask(env);
1520     unsigned e;
1521     uint32_t r;
1522 
1523     /*
1524      * For each 32-bit element, we shift it left, bringing in the
1525      * low 'shift' bits of rdm at the bottom. Bits shifted out at
1526      * the top become the new rdm, if the predicate mask permits.
1527      * The final rdm value is returned to update the register.
1528      * shift == 0 here means "shift by 32 bits".
1529      */
1530     if (shift == 0) {
1531         for (e = 0; e < 16 / 4; e++, mask >>= 4) {
1532             r = rdm;
1533             if (mask & 1) {
1534                 rdm = d[H4(e)];
1535             }
1536             mergemask(&d[H4(e)], r, mask);
1537         }
1538     } else {
1539         uint32_t shiftmask = MAKE_64BIT_MASK(0, shift);
1540 
1541         for (e = 0; e < 16 / 4; e++, mask >>= 4) {
1542             r = (d[H4(e)] << shift) | (rdm & shiftmask);
1543             if (mask & 1) {
1544                 rdm = d[H4(e)] >> (32 - shift);
1545             }
1546             mergemask(&d[H4(e)], r, mask);
1547         }
1548     }
1549     mve_advance_vpt(env);
1550     return rdm;
1551 }
1552 
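/*
 * Worked example (illustrative): with shift == 8 and a fully set predicate,
 * each element becomes (d[H4(e)] << 8) | (rdm & 0xff) and the new rdm is
 * d[H4(e)] >> 24, so the chain of four elements behaves like a 128-bit
 * left shift by 8 with rdm feeding the bottom and collecting the top.
 * With a partial predicate, rdm is only updated for active elements
 * ('mask & 1' set) and mergemask() leaves the inactive elements unchanged.
 */
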
1553 uint64_t HELPER(mve_sshrl)(CPUARMState *env, uint64_t n, uint32_t shift)
1554 {
1555     return do_sqrshl_d(n, -(int8_t)shift, false, NULL);
1556 }
1557 
1558 uint64_t HELPER(mve_ushll)(CPUARMState *env, uint64_t n, uint32_t shift)
1559 {
1560     return do_uqrshl_d(n, (int8_t)shift, false, NULL);
1561 }
1562 
1563 uint64_t HELPER(mve_sqshll)(CPUARMState *env, uint64_t n, uint32_t shift)
1564 {
1565     return do_sqrshl_d(n, (int8_t)shift, false, &env->QF);
1566 }
1567 
1568 uint64_t HELPER(mve_uqshll)(CPUARMState *env, uint64_t n, uint32_t shift)
1569 {
1570     return do_uqrshl_d(n, (int8_t)shift, false, &env->QF);
1571 }
1572 
1573 uint64_t HELPER(mve_sqrshrl)(CPUARMState *env, uint64_t n, uint32_t shift)
1574 {
1575     return do_sqrshl_d(n, -(int8_t)shift, true, &env->QF);
1576 }
1577 
1578 uint64_t HELPER(mve_uqrshll)(CPUARMState *env, uint64_t n, uint32_t shift)
1579 {
1580     return do_uqrshl_d(n, (int8_t)shift, true, &env->QF);
1581 }
1582 
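/*
 * Illustrative note: these scalar helpers all defer to the shared
 * do_sqrshl_d()/do_uqrshl_d() routines. Right shifts are expressed as left
 * shifts by the negated amount (hence -(int8_t)shift), the boolean
 * argument is true for the rounding SQRSHRL/UQRSHLL forms, and passing
 * &env->QF rather than NULL makes any saturation sticky in the Q flag,
 * whereas SSHRL/USHLL do not saturate at all.
 */
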
1583 /* Operate on 64-bit values, but saturate at 48 bits */
1584 static inline int64_t do_sqrshl48_d(int64_t src, int64_t shift,
1585                                     bool round, uint32_t *sat)
1586 {
1587     int64_t val, extval;
1588 
1589     if (shift <= -48) {
1590         /* Rounding the sign bit always produces 0. */
1591         if (round) {
1592             return 0;
1593         }
1594         return src >> 63;
1595     } else if (shift < 0) {
1596         if (round) {
1597             src >>= -shift - 1;
1598             val = (src >> 1) + (src & 1);
1599         } else {
1600             val = src >> -shift;
1601         }
1602         extval = sextract64(val, 0, 48);
1603         if (!sat || val == extval) {
1604             return extval;
1605         }
1606     } else if (shift < 48) {
1607         extval = sextract64(src << shift, 0, 48);
1608         if (!sat || src == (extval >> shift)) {
1609             return extval;
1610         }
1611     } else if (!sat || src == 0) {
1612         return 0;
1613     }
1614 
1615     *sat = 1;
1616     return src >= 0 ? MAKE_64BIT_MASK(0, 47) : MAKE_64BIT_MASK(47, 17);
1617 }
1618 
1619 /* Operate on 64-bit values, but saturate at 48 bits */
1620 static inline uint64_t do_uqrshl48_d(uint64_t src, int64_t shift,
1621                                      bool round, uint32_t *sat)
1622 {
1623     uint64_t val, extval;
1624 
1625     if (shift <= -(48 + round)) {
1626         return 0;
1627     } else if (shift < 0) {
1628         if (round) {
1629             val = src >> (-shift - 1);
1630             val = (val >> 1) + (val & 1);
1631         } else {
1632             val = src >> -shift;
1633         }
1634         extval = extract64(val, 0, 48);
1635         if (!sat || val == extval) {
1636             return extval;
1637         }
1638     } else if (shift < 48) {
1639         extval = extract64(src << shift, 0, 48);
1640         if (!sat || src == (extval >> shift)) {
1641             return extval;
1642         }
1643     } else if (!sat || src == 0) {
1644         return 0;
1645     }
1646 
1647     *sat = 1;
1648     return MAKE_64BIT_MASK(0, 48);
1649 }
1650 
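/*
 * Worked example (illustrative): do_sqrshl48_d(1ll << 47, 0, false, &sat)
 * finds that sextract64(src, 0, 48) no longer equals src, because bit 47
 * is the sign bit of the 48-bit result, so it saturates to
 * MAKE_64BIT_MASK(0, 47) = 0x00007fffffffffff and sets *sat. The negative
 * saturation value MAKE_64BIT_MASK(47, 17) is the 64-bit sign extension
 * of -2^47 (0xffff800000000000).
 */
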
1651 uint64_t HELPER(mve_sqrshrl48)(CPUARMState *env, uint64_t n, uint32_t shift)
1652 {
1653     return do_sqrshl48_d(n, -(int8_t)shift, true, &env->QF);
1654 }
1655 
1656 uint64_t HELPER(mve_uqrshll48)(CPUARMState *env, uint64_t n, uint32_t shift)
1657 {
1658     return do_uqrshl48_d(n, (int8_t)shift, true, &env->QF);
1659 }
1660 
1661 uint32_t HELPER(mve_uqshl)(CPUARMState *env, uint32_t n, uint32_t shift)
1662 {
1663     return do_uqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
1664 }
1665 
1666 uint32_t HELPER(mve_sqshl)(CPUARMState *env, uint32_t n, uint32_t shift)
1667 {
1668     return do_sqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
1669 }
1670 
1671 uint32_t HELPER(mve_uqrshl)(CPUARMState *env, uint32_t n, uint32_t shift)
1672 {
1673     return do_uqrshl_bhs(n, (int8_t)shift, 32, true, &env->QF);
1674 }
1675 
1676 uint32_t HELPER(mve_sqrshr)(CPUARMState *env, uint32_t n, uint32_t shift)
1677 {
1678     return do_sqrshl_bhs(n, -(int8_t)shift, 32, true, &env->QF);
1679 }
1680