/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "exec/page-protection.h"
#include "exec/helper-proto.h"
#include "exec/target_page.h"
#include "exec/tlb-flags.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "tcg/tcg.h"
#include "vec_internal.h"
#include "sve_ldst_internal.h"
#include "accel/tcg/cpu-ldst.h"
#include "accel/tcg/helper-retaddr.h"
#include "accel/tcg/cpu-ops.h"
#include "accel/tcg/probe.h"
#ifdef CONFIG_USER_ONLY
#include "user/page-protection.h"
#endif


/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1
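/*
 * For example, iter_predtest_fwd(1, 1, PREDTEST_INIT) below yields
 * 0x80000006: bit 31 (N) set because the first active element is true,
 * bit 1 set because Z is clear (some active element is true), and
 * bit 0 clear because the last active element is true (C clear).
 * Bit 2 is internal bookkeeping marking that an active bit was seen.
 */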

/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}

/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e. last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e. first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}

/* The same for a single word predicate.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

/* The same for a multi-word predicate.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}

/* Similarly for single word elements.  */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    return word[byte & 0x11];
}

static inline uint64_t expand_pred_d(uint8_t byte)
{
    return -(uint64_t)(byte & 1);
}

#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    uintptr_t opr_sz = simd_oprsz(desc);                                  \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
    uintptr_t i;                                                          \
    for (i = 0; i < opr_sz / 8; ++i) {                                    \
        d[i] = FUNC(n[i], m[i], g[i]);                                    \
    }                                                                     \
}

#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
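/*
 * In each of these the governing predicate G is folded directly into the
 * bitwise expression, so entire 64-bit predicate words can be combined
 * at once without testing individual elements.
 */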

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP

/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; ) {                                         \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 15);                                               \
    }                                                                   \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn, *m = vm;                             \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i], mm = m[i];                          \
            d[i] = OP(nn, mm);                                  \
        }                                                       \
    }                                                           \
}

#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ORC(N, M)  (N | ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)


/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1. Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
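/*
 * The architecturally required results are 0 for any division by zero
 * and, for SDIV, the minimum integer again when dividing the minimum
 * integer by -1 (the quotient wraps).
 */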

DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)

/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types.  */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
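/*
 * The signed/unsigned distinction for the byte, half and word forms is
 * made by the element TYPE at each instantiation below: signed types
 * arrive sign-extended and unsigned types zero-extended, and the helper
 * simply returns the high half of the widened product.
 */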

DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)

/* Note that all bits of the shift are significant
   and not modulo the element size.  */
#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)

DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)

DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)

static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    int8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    int16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    int32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}
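/*
 * Pairwise add and accumulate long (SADALP, and UADALP below): each
 * result element is M plus the sum of the two narrow halves of the
 * corresponding element of N.
 */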

DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)

static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    uint8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    uint16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    uint32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)

#define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)

#define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)

/*
 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
 * We pass in a pointer to a dummy saturation field to trigger
 * the saturating arithmetic but discard the information about
 * whether it has occurred.
 */
#define do_sqshl_b(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
#define do_sqshl_h(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
#define do_sqshl_s(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
#define do_sqshl_d(n, m) \
   ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)

#define do_uqshl_b(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_uqshl_h(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_uqshl_s(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
#define do_uqshl_d(n, m) \
   ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)

#define do_sqrshl_b(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
#define do_sqrshl_h(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
#define do_sqrshl_s(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
#define do_sqrshl_d(n, m) \
   ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)

#undef do_sqrshl_d

#define do_uqrshl_b(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
#define do_uqrshl_h(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
#define do_uqrshl_s(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
#define do_uqrshl_d(n, m) \
   ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)

#undef do_uqrshl_d

#define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))
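/*
 * The byte/half/word forms widen to 64 bits so the halving shift keeps
 * the carry of the addition; the 64-bit form cannot widen, so it halves
 * each operand first and then adds back the bit that is lost when both
 * low bits are set.
 */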

DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)

#define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))

DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)

#define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))
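/*
 * Likewise for the rounding and subtracting forms: the rounding add
 * carries in with ((n | m) & 1), and the halving subtraction repairs the
 * result with (~n & m & 1), exactly the case in which discarding the low
 * bits loses a borrow.
 */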

DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)

#define DO_SQADD_B(n, m) do_ssat_b((int64_t)n + m)
#define DO_SQADD_H(n, m) do_ssat_h((int64_t)n + m)
#define DO_SQADD_S(n, m) do_ssat_s((int64_t)n + m)

static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    int64_t r = n + m;
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
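/*
 * ((r ^ n) & ~(n ^ m)) < 0 is true exactly when the operands have the
 * same sign and the result's sign differs from it, i.e. when the
 * addition overflowed; the result then saturates in the direction of
 * the operands' sign.
 */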

DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)

#define DO_UQADD_B(n, m) do_usat_b((int64_t)n + m)
#define DO_UQADD_H(n, m) do_usat_h((int64_t)n + m)
#define DO_UQADD_S(n, m) do_usat_s((int64_t)n + m)

static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t r = n + m;
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

#define DO_SQSUB_B(n, m) do_ssat_b((int64_t)n - m)
#define DO_SQSUB_H(n, m) do_ssat_h((int64_t)n - m)
#define DO_SQSUB_S(n, m) do_ssat_s((int64_t)n - m)

static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = n - m;
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}

DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

#define DO_UQSUB_B(n, m) do_usat_b((int64_t)n - m)
#define DO_UQSUB_H(n, m) do_usat_h((int64_t)n - m)
#define DO_UQSUB_S(n, m) do_usat_s((int64_t)n - m)

static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    return n > m ? n - m : 0;
}

DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

#define DO_SUQADD_B(n, m) do_ssat_b((int64_t)(int8_t)n + m)
#define DO_SUQADD_H(n, m) do_ssat_h((int64_t)(int16_t)n + m)
#define DO_SUQADD_S(n, m) do_ssat_s((int64_t)(int32_t)n + m)

static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative. */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative. */
        }
    } else {
        /* Both inputs are positive: check for overflow.  */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}

DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

#define DO_USQADD_B(n, m) do_usat_b((int64_t)n + (int8_t)m)
#define DO_USQADD_H(n, m) do_usat_h((int64_t)n + (int16_t)m)
#define DO_USQADD_S(n, m) do_usat_s((int64_t)n + (int32_t)m)

static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        return n < -m ? 0 : r;
    }
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

#undef DO_ZPZZ
#undef DO_ZPZZ_D

/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements are from VN {I, I+1}.
 * If the slot I is odd, the elements are from VM {I-1, I}.
 * Load all of the input elements in each pair before overwriting output.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            TYPE n0 = *(TYPE *)(vn + H(i));                     \
            TYPE m0 = *(TYPE *)(vm + H(i));                     \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(n0, n1);              \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(m0, m1);              \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn, *m = vm;                             \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 2) {                           \
        TYPE n0 = n[i], n1 = n[i + 1];                          \
        TYPE m0 = m[i], m1 = m[i + 1];                          \
        if (pg[H1(i)] & 1) {                                    \
            d[i] = OP(n0, n1);                                  \
        }                                                       \
        if (pg[H1(i + 1)] & 1) {                                \
            d[i + 1] = OP(m0, m1);                              \
        }                                                       \
    }                                                           \
}
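/*
 * For example, for ADDP with byte elements: result element 0 is
 * n[0] + n[1], result element 1 is m[0] + m[1], element 2 is n[2] + n[3],
 * and so on, each written only if its own predicate bit is set.
 */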

DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D

#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  float_status *status, uint32_t desc)                  \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; ) {                                         \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            TYPE n0 = *(TYPE *)(vn + H(i));                             \
            TYPE m0 = *(TYPE *)(vm + H(i));                             \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));              \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));              \
            if (pg & 1) {                                               \
                *(TYPE *)(vd + H(i)) = OP(n0, n1, status);              \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
            if (pg & 1) {                                               \
                *(TYPE *)(vd + H(i)) = OP(m0, m1, status);              \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 15);                                               \
    }                                                                   \
}

DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)

DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)

DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)

DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)

DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)

#undef DO_ZPZZ_PAIR_FP

/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; ) {                                         \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
        TYPEW mm = *(TYPEW *)(vm + i);                                  \
        do {                                                            \
            if (pg & 1) {                                               \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 7);                                                \
    }                                                                   \
}
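/*
 * The wide operand is reloaded once per 8-byte group, so all narrow
 * elements within a given doubleword of N use the same 64-bit value of
 * M (here, the same shift count).
 */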

DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW

/* Fully general two-operand expander, controlled by a predicate.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn);                  \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn);                                      \
        }                                                       \
    }                                                           \
}

#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)
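/*
 * The byte and halfword inputs are widened to 32 bits before counting,
 * so the result is reduced by the 24 or 16 extra sign bits (and, for the
 * CLZ forms below, extra leading zeros) introduced by the widening.
 */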

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))
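/*
 * (__typeof(N))-1 >> 1 is an all-ones value of N's type with the top
 * (sign) bit clear, so DO_FABS clears the sign bit and DO_FNEG below
 * flips it, operating purely on the bit pattern.
 */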

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_AH_FABS_H(N) (float16_is_any_nan(N) ? (N) : DO_FABS(N))
#define DO_AH_FABS_S(N) (float32_is_any_nan(N) ? (N) : DO_FABS(N))
#define DO_AH_FABS_D(N) (float64_is_any_nan(N) ? (N) : DO_FABS(N))

DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H)
DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S)
DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D)

#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_AH_FNEG_H(N) (float16_is_any_nan(N) ? (N) : DO_FNEG(N))
#define DO_AH_FNEG_S(N) (float32_is_any_nan(N) ? (N) : DO_FNEG(N))
#define DO_AH_FNEG_D(N) (float64_is_any_nan(N) ? (N) : DO_FNEG(N))

DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H)
DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S)
DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D)

#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

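/* Swap the two 64-bit halves of each 128-bit quadword; only the
   predicate bit governing the low half of each pair is tested. */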
void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 2) {
        if (pg[H1(i)] & 1) {
            uint64_t n0 = n[i + 0];
            uint64_t n1 = n[i + 1];
            d[i + 0] = n1;
            d[i + 1] = n0;
        }
    }
}

DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)

#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)

#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)

DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)

/* Three-operand expander, unpredicated, in which the third operand is "wide".
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    for (i = 0; i < opr_sz; ) {                                \
        TYPEW mm = *(TYPEW *)(vm + i);                         \
        do {                                                   \
            TYPE nn = *(TYPE *)(vn + H(i));                    \
            *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
            i += sizeof(TYPE);                                 \
        } while (i & 7);                                       \
    }                                                          \
}

DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D

/*
 * Three-operand expander, unpredicated, in which the two inputs are
 * selected from the top or bottom half of the wide column.
 */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
    }                                                                   \
}
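/*
 * sel1 and sel2 are 0 or sizeof(TYPEN), so each selects either the even
 * ("bottom") or the odd ("top") narrow element within each wide column,
 * which is how the B/T forms of the instructions are distinguished.
 */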

DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

/* Note that the multiply cannot overflow, but the doubling can. */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t val = n * m;
    return DO_SQADD_H(val, val);
}

static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t val = n * m;
    return DO_SQADD_S(val, val);
}

static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    int64_t val = n * m;
    return do_sqadd_d(val, val);
}

DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

#undef DO_ZZZ_TB

#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                     \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));              \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                   \
    }                                                          \
}

DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

#undef DO_ZZZ_WTB

#define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
    intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
    for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
        TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
        TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
        *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
    }                                                                   \
}

DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)

#undef DO_ZZZ_NTB

#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);            \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));               \
        TYPEW aa = *(TYPEW *)(va + HW(i));                      \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;               \
    }                                                           \
}

DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#define DO_NMUL(N, M)  -(N * M)

DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)

DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)

#undef DO_ZZZW_ACC

#define DO_XTNB(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    intptr_t i, opr_sz = simd_oprsz(desc);                   \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
        TYPE nn = *(TYPE *)(vn + i);                         \
        nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
        *(TYPE *)(vd + i) = nn;                              \
    }                                                        \
}

#define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));      \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
        TYPE nn = *(TYPE *)(vn + i);                                    \
        *(TYPEN *)(vd + i + odd) = OP(nn);                              \
    }                                                                   \
}

DO_XTNB(sve2_sqxtnb_h, int16_t, do_ssat_b)
DO_XTNB(sve2_sqxtnb_s, int32_t, do_ssat_h)
DO_XTNB(sve2_sqxtnb_d, int64_t, do_ssat_s)

DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, do_ssat_b)
DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, do_ssat_h)
DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, do_ssat_s)

DO_XTNB(sve2_uqxtnb_h, uint16_t, do_usat_b)
DO_XTNB(sve2_uqxtnb_s, uint32_t, do_usat_h)
DO_XTNB(sve2_uqxtnb_d, uint64_t, do_usat_s)

DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, do_usat_b)
DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, do_usat_h)
DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, do_usat_s)

DO_XTNB(sve2_sqxtunb_h, int16_t, do_usat_b)
DO_XTNB(sve2_sqxtunb_s, int32_t, do_usat_h)
DO_XTNB(sve2_sqxtunb_d, int64_t, do_usat_s)

DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, do_usat_b)
DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, do_usat_h)
DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, do_usat_s)

#undef DO_XTNB
#undef DO_XTNT

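/*
 * Add/subtract with carry long: for the 32-bit form the carry in is
 * bit 32 of the previous wide result held in M, and the full 33-bit sum
 * is written back as one 64-bit element; the 64-bit form below keeps a
 * 65-bit sum in an Int128.  The optional inversion of N provides the
 * subtracting variants.
 */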
void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
    uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t *a = va, *n = vn;
    uint64_t *d = vd, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint32_t e1 = a[2 * i + H4(0)];
        uint32_t e2 = n[2 * i + sel] ^ inv;
        uint64_t c = extract64(m[i], 32, 1);
        /* Compute and store the entire 33-bit result at once. */
        d[i] = c + e1 + e2;
    }
}

void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t *d = vd, *a = va, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        Int128 e1 = int128_make64(a[i]);
        Int128 e2 = int128_make64(n[i + sel] ^ inv);
        Int128 c = int128_make64(m[i + 1] & 1);
        Int128 r = int128_add(int128_add(e1, e2), c);
        d[i + 0] = int128_getlo(r);
        d[i + 1] = int128_gethi(r);
    }
}

#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
        TYPEW aa = *(TYPEW *)(va + HW(i));                              \
        *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
    }                                                                   \
}

DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQADD_H)
DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQADD_S)
DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqadd_d)

DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQSUB_H)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQSUB_S)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
           do_sqdmull_d, do_sqsub_d)

#undef DO_SQDMLAL

#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
    int rot = simd_data(desc);                                  \
    int sel_a = rot & 1, sel_b = sel_a ^ 1;                     \
    bool sub_r = rot == 1 || rot == 2;                          \
    bool sub_i = rot >= 2;                                      \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
    for (i = 0; i < opr_sz; i += 2) {                           \
        TYPE elt1_a = n[H(i + sel_a)];                          \
        TYPE elt2_a = m[H(i + sel_a)];                          \
        TYPE elt2_b = m[H(i + sel_b)];                          \
        d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);           \
        d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);   \
    }                                                           \
}
1332 
1333 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
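
/*
 * For reference, the rot encoding consumed by DO_CMLA_FUNC expands to the
 * following per-pair updates (derived from sel_a/sub_r/sub_i above, with
 * r/i denoting the even/odd element of each pair); issuing rot=0 followed
 * by rot=1 therefore yields a full complex multiply-accumulate:
 *
 *   rot=0:  d_r = a_r + n_r*m_r    d_i = a_i + n_r*m_i
 *   rot=1:  d_r = a_r - n_i*m_i    d_i = a_i + n_i*m_r
 *   rot=2:  d_r = a_r - n_r*m_r    d_i = a_i - n_r*m_i
 *   rot=3:  d_r = a_r + n_i*m_i    d_i = a_i - n_i*m_r
 */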
1334 
1335 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1336 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1337 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1338 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1339 
1340 #define DO_SQRDMLAH_B(N, M, A, S) \
1341     do_sqrdmlah_b(N, M, A, S, true)
1342 #define DO_SQRDMLAH_H(N, M, A, S) \
1343     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1344 #define DO_SQRDMLAH_S(N, M, A, S) \
1345     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1346 #define DO_SQRDMLAH_D(N, M, A, S) \
1347     do_sqrdmlah_d(N, M, A, S, true)
1348 
1349 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1350 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1351 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1352 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1353 
1354 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1355 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
1356 {                                                                           \
1357     intptr_t i, j, oprsz = simd_oprsz(desc);                                \
1358     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
1359     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
1360     int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
1361     bool sub_r = rot == 1 || rot == 2;                                      \
1362     bool sub_i = rot >= 2;                                                  \
1363     TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
1364     for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
1365         TYPE elt2_a = m[H(i + idx + sel_a)];                                \
1366         TYPE elt2_b = m[H(i + idx + sel_b)];                                \
1367         for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
1368             TYPE elt1_a = n[H(i + j + sel_a)];                              \
1369             d[H(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);           \
1370             d[H(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);   \
1371         }                                                                   \
1372     }                                                                       \
1373 }
1374 
1375 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1376 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1377 
1378 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1379 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1380 
1381 #undef DO_CMLA
1382 #undef DO_CMLA_FUNC
1383 #undef DO_CMLA_IDX_FUNC
1384 #undef DO_SQRDMLAH_B
1385 #undef DO_SQRDMLAH_H
1386 #undef DO_SQRDMLAH_S
1387 #undef DO_SQRDMLAH_D
1388 
1389 /* Note: N and M each bundle 4 narrow elements (two complex pairs) into one unit. */
1390 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1391                          int sel_a, int sel_b, int sub_i)
1392 {
1393     for (int i = 0; i <= 1; i++) {
1394         int32_t elt1_r = (int8_t)(n >> (16 * i));
1395         int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1396         int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1397         int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1398 
1399         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1400     }
1401     return a;
1402 }
1403 
1404 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1405                          int sel_a, int sel_b, int sub_i)
1406 {
1407     for (int i = 0; i <= 1; i++) {
1408         int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1409         int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1410         int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1411         int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1412 
1413         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1414     }
1415     return a;
1416 }
1417 
1418 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1419                               void *va, uint32_t desc)
1420 {
1421     int opr_sz = simd_oprsz(desc);
1422     int rot = simd_data(desc);
1423     int sel_a = rot & 1;
1424     int sel_b = sel_a ^ 1;
1425     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1426     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1427 
1428     for (int e = 0; e < opr_sz / 4; e++) {
1429         d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1430     }
1431 }
1432 
1433 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1434                               void *va, uint32_t desc)
1435 {
1436     int opr_sz = simd_oprsz(desc);
1437     int rot = simd_data(desc);
1438     int sel_a = rot & 1;
1439     int sel_b = sel_a ^ 1;
1440     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1441     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1442 
1443     for (int e = 0; e < opr_sz / 8; e++) {
1444         d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1445     }
1446 }
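
/*
 * With the rot decoding above (sel_a = rot & 1, sub_i = -1 for rot 0 or 3),
 * each do_cdot_* step accumulates, for every complex pair (n_r, n_i) in N
 * against (m_r, m_i) in M:
 *
 *   rot=0:  a += n_r*m_r - n_i*m_i
 *   rot=1:  a += n_r*m_i + n_i*m_r
 *   rot=2:  a += n_r*m_r + n_i*m_i
 *   rot=3:  a += n_r*m_i - n_i*m_r
 */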
1447 
1448 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1449                              void *va, uint32_t desc)
1450 {
1451     int opr_sz = simd_oprsz(desc);
1452     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1453     int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1454     int sel_a = rot & 1;
1455     int sel_b = sel_a ^ 1;
1456     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1457     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1458 
1459     for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1460         uint32_t seg_m = m[seg + idx];
1461         for (int e = 0; e < 4; e++) {
1462             d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1463                                    sel_a, sel_b, sub_i);
1464         }
1465     }
1466 }
1467 
1468 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1469                              void *va, uint32_t desc)
1470 {
1471     int seg, opr_sz = simd_oprsz(desc);
1472     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1473     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1474     int sel_a = rot & 1;
1475     int sel_b = sel_a ^ 1;
1476     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1477     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1478 
1479     for (seg = 0; seg < opr_sz / 8; seg += 2) {
1480         uint64_t seg_m = m[seg + idx];
1481         for (int e = 0; e < 2; e++) {
1482             d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1483                                    sel_a, sel_b, sub_i);
1484         }
1485     }
1486 }
1487 
1488 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1489 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1490 {                                                                       \
1491     intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
1492     intptr_t i, j, idx = simd_data(desc);                               \
1493     TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
1494     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
1495         TYPE mm = m[i];                                                 \
1496         for (j = 0; j < segment; j++) {                                 \
1497             d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
1498         }                                                               \
1499     }                                                                   \
1500 }
1501 
1502 #define DO_SQRDMLAH_H(N, M, A) \
1503     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1504 #define DO_SQRDMLAH_S(N, M, A) \
1505     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1506 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1507 
1508 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1509 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1510 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1511 
1512 #define DO_SQRDMLSH_H(N, M, A) \
1513     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1514 #define DO_SQRDMLSH_S(N, M, A) \
1515     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1516 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1517 
1518 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1519 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1520 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1521 
1522 #undef DO_ZZXZ
1523 
1524 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1525 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
1526 {                                                                         \
1527     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1528     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1529     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1530     for (i = 0; i < oprsz; i += 16) {                                     \
1531         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1532         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1533             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1534             TYPEW aa = *(TYPEW *)(va + HW(i + j));                        \
1535             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                  \
1536         }                                                                 \
1537     }                                                                     \
1538 }
1539 
1540 #define DO_MLA(N, M, A)  (A + N * M)
1541 
1542 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1543 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1544 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1545 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1546 
1547 #define DO_MLS(N, M, A)  (A - N * M)
1548 
1549 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1550 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1551 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1552 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1553 
1554 #define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
1555 #define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))
1556 
1557 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1558 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1559 
1560 #define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
1561 #define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))
1562 
1563 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1564 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1565 
1566 #undef DO_MLA
1567 #undef DO_MLS
1568 #undef DO_ZZXW
1569 
1570 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1571 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
1572 {                                                                         \
1573     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1574     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1575     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1576     for (i = 0; i < oprsz; i += 16) {                                     \
1577         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1578         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1579             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1580             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                      \
1581         }                                                                 \
1582     }                                                                     \
1583 }
1584 
1585 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1586 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1587 
1588 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1589 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1590 
1591 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1592 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1593 
1594 #undef DO_ZZX
1595 
1596 #define DO_BITPERM(NAME, TYPE, OP) \
1597 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1598 {                                                              \
1599     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1600     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {               \
1601         TYPE nn = *(TYPE *)(vn + i);                           \
1602         TYPE mm = *(TYPE *)(vm + i);                           \
1603         *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);      \
1604     }                                                          \
1605 }
1606 
1607 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1608 {
1609     uint64_t res = 0;
1610     int db, rb = 0;
1611 
1612     for (db = 0; db < n; ++db) {
1613         if ((mask >> db) & 1) {
1614             res |= ((data >> db) & 1) << rb;
1615             ++rb;
1616         }
1617     }
1618     return res;
1619 }
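
/*
 * Worked example: bitextract(0xb2, 0x5a, 8) gathers the data bits that sit
 * under the four set mask bits (positions 1, 3, 4, 6, holding 1, 0, 1, 0)
 * into the low-order result bits, giving 0x05.
 */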
1620 
1621 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1622 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1623 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1624 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1625 
1626 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1627 {
1628     uint64_t res = 0;
1629     int rb, db = 0;
1630 
1631     for (rb = 0; rb < n; ++rb) {
1632         if ((mask >> rb) & 1) {
1633             res |= ((data >> db) & 1) << rb;
1634             ++db;
1635         }
1636     }
1637     return res;
1638 }
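
/*
 * Worked example: bitdeposit(0x05, 0x5a, 8) scatters the low data bits
 * (1, 0, 1, 0, ...) back out to the set mask positions 1, 3, 4, 6, giving
 * 0x12.  BDEP thus inverts BEXT over the masked positions:
 * bitdeposit(bitextract(x, m, n), m, n) == (x & m).
 */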
1639 
1640 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1641 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1642 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1643 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1644 
1645 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1646 {
1647     uint64_t resm = 0, resu = 0;
1648     int db, rbm = 0, rbu = 0;
1649 
1650     for (db = 0; db < n; ++db) {
1651         uint64_t val = (data >> db) & 1;
1652         if ((mask >> db) & 1) {
1653             resm |= val << rbm++;
1654         } else {
1655             resu |= val << rbu++;
1656         }
1657     }
1658 
1659     return resm | (resu << rbm);
1660 }
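
/*
 * Worked example: bitgroup(0xb2, 0x5a, 8) packs the bits selected by the
 * mask (1, 0, 1, 0 from positions 1, 3, 4, 6) into the low result bits and
 * the unselected bits (0, 0, 1, 1 from positions 0, 2, 5, 7) immediately
 * above them, giving 0xc5.
 */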
1661 
1662 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1663 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1664 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1665 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1666 
1667 #undef DO_BITPERM
1668 
1669 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
1670 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
1671 {                                                               \
1672     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1673     int sub_r = simd_data(desc);                                \
1674     if (sub_r) {                                                \
1675         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1676             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1677             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1678             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1679             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1680             acc_r = ADD_OP(acc_r, el2_i);                       \
1681             acc_i = SUB_OP(acc_i, el2_r);                       \
1682             *(TYPE *)(vd + H(i)) = acc_r;                       \
1683             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1684         }                                                       \
1685     } else {                                                    \
1686         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1687             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1688             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1689             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1690             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1691             acc_r = SUB_OP(acc_r, el2_i);                       \
1692             acc_i = ADD_OP(acc_i, el2_r);                       \
1693             *(TYPE *)(vd + H(i)) = acc_r;                       \
1694             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1695         }                                                       \
1696     }                                                           \
1697 }
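
/*
 * In the expansion above, sub_r set performs the #270 rotation,
 * d = n + m * (-i):  d_r = n_r + m_i,  d_i = n_i - m_r;
 * sub_r clear performs the #90 rotation, d = n + m * (+i):
 * d_r = n_r - m_i,  d_i = n_i + m_r.
 */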
1698 
1699 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1700 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1701 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1702 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1703 
1704 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1705 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1706 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1707 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1708 
1709 #undef DO_CADD
1710 
1711 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1712 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1713 {                                                              \
1714     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1715     intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);      \
1716     int shift = simd_data(desc) >> 1;                          \
1717     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1718         TYPEW nn = *(TYPEN *)(vn + HN(i + sel));               \
1719         *(TYPEW *)(vd + HW(i)) = nn << shift;                  \
1720     }                                                          \
1721 }
1722 
1723 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1724 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1725 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1726 
1727 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1728 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1729 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1730 
1731 #undef DO_ZZI_SHLL
1732 
1733 /* Two-operand reduction expander, controlled by a predicate.
1734  * The difference between TYPERED and TYPERET has to do with
1735  * sign-extension.  E.g. for SMAX, TYPERED must be signed,
1736  * but TYPERET must be unsigned so that e.g. a 32-bit value
1737  * is not sign-extended to the ABI uint64_t return type.
1738  */
1739 /* ??? If we were to vectorize this by hand, the reduction ordering
1740  * would change.  For integer operands, this is perfectly fine.
1741  */
1742 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1743 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1744 {                                                          \
1745     intptr_t i, opr_sz = simd_oprsz(desc);                 \
1746     TYPERED ret = INIT;                                    \
1747     for (i = 0; i < opr_sz; ) {                            \
1748         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
1749         do {                                               \
1750             if (pg & 1) {                                  \
1751                 TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
1752                 ret = OP(ret, nn);                         \
1753             }                                              \
1754             i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
1755         } while (i & 15);                                  \
1756     }                                                      \
1757     return (TYPERET)ret;                                   \
1758 }
1759 
1760 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
1761 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1762 {                                                          \
1763     intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
1764     TYPEE *n = vn;                                         \
1765     uint8_t *pg = vg;                                      \
1766     TYPER ret = INIT;                                      \
1767     for (i = 0; i < opr_sz; i += 1) {                      \
1768         if (pg[H1(i)] & 1) {                               \
1769             TYPEE nn = n[i];                               \
1770             ret = OP(ret, nn);                             \
1771         }                                                  \
1772     }                                                      \
1773     return ret;                                            \
1774 }
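
/*
 * For reference: the governing predicate carries one bit per byte of the
 * vector, and only the bit aligned with an element's least significant
 * byte is tested.  A minimal sketch of that test for logical element "e"
 * of "esize" bytes, ignoring the host-endian H*() adjustments used above
 * (the helper name is illustrative, not part of this file):
 */
static inline bool sve_pred_elt_active_sketch(const uint8_t *pg,
                                              intptr_t e, int esize)
{
    intptr_t bit = e * esize;               /* one predicate bit per byte */
    return (pg[bit / 8] >> (bit % 8)) & 1;  /* test the element's low bit */
}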
1775 
1776 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1777 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1778 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1779 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1780 
1781 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1782 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1783 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1784 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1785 
1786 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1787 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1788 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1789 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1790 
1791 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1792 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1793 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1794 
1795 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1796 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1797 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1798 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1799 
1800 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1801 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1802 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1803 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1804 
1805 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1806 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1807 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1808 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1809 
1810 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1811 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1812 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1813 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1814 
1815 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1816 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1817 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1818 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1819 
1820 #undef DO_VPZ
1821 #undef DO_VPZ_D
1822 
1823 #define DO_VPQ(NAME, TYPE, H, INIT, OP) \
1824 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)          \
1825 {                                                                       \
1826     TYPE tmp[16 / sizeof(TYPE)] = { [0 ... 16 / sizeof(TYPE) - 1] = INIT }; \
1827     TYPE *n = vn; uint16_t *g = vg;                                     \
1828     uintptr_t oprsz = simd_oprsz(desc);                                 \
1829     uintptr_t nseg = oprsz / 16, nsegelt = 16 / sizeof(TYPE);           \
1830     for (uintptr_t s = 0; s < nseg; s++) {                              \
1831         uint16_t pg = g[H2(s)];                                         \
1832         for (uintptr_t e = 0; e < nsegelt; e++, pg >>= sizeof(TYPE)) {  \
1833             if (pg & 1) {                                               \
1834                 tmp[H(e)] = OP(tmp[H(e)], n[s * nsegelt + H(e)]);       \
1835             }                                                           \
1836         }                                                               \
1837     }                                                                   \
1838     memcpy(vd, tmp, 16);                                                \
1839     clear_tail(vd, 16, simd_maxsz(desc));                               \
1840 }
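
/*
 * In other words, lane e of the 128-bit result is the reduction of lane e
 * of every 16-byte segment where that lane is active.  E.g. for a 256-bit
 * uint32_t ADDQV with all elements active, { a0, a1, a2, a3, b0, b1, b2, b3 }
 * reduces to { a0+b0, a1+b1, a2+b2, a3+b3 }, with the rest of the
 * destination zeroed by clear_tail().
 */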
1841 
1842 DO_VPQ(sve2p1_addqv_b, uint8_t, H1, 0, DO_ADD)
1843 DO_VPQ(sve2p1_addqv_h, uint16_t, H2, 0, DO_ADD)
1844 DO_VPQ(sve2p1_addqv_s, uint32_t, H4, 0, DO_ADD)
1845 DO_VPQ(sve2p1_addqv_d, uint64_t, H8, 0, DO_ADD)
1846 
1847 DO_VPQ(sve2p1_smaxqv_b, int8_t, H1, INT8_MIN, DO_MAX)
1848 DO_VPQ(sve2p1_smaxqv_h, int16_t, H2, INT16_MIN, DO_MAX)
1849 DO_VPQ(sve2p1_smaxqv_s, int32_t, H4, INT32_MIN, DO_MAX)
1850 DO_VPQ(sve2p1_smaxqv_d, int64_t, H8, INT64_MIN, DO_MAX)
1851 
1852 DO_VPQ(sve2p1_sminqv_b, int8_t, H1, INT8_MAX, DO_MIN)
1853 DO_VPQ(sve2p1_sminqv_h, int16_t, H2, INT16_MAX, DO_MIN)
1854 DO_VPQ(sve2p1_sminqv_s, int32_t, H4, INT32_MAX, DO_MIN)
1855 DO_VPQ(sve2p1_sminqv_d, int64_t, H8, INT64_MAX, DO_MIN)
1856 
1857 DO_VPQ(sve2p1_umaxqv_b, uint8_t, H1, 0, DO_MAX)
1858 DO_VPQ(sve2p1_umaxqv_h, uint16_t, H2, 0, DO_MAX)
1859 DO_VPQ(sve2p1_umaxqv_s, uint32_t, H4, 0, DO_MAX)
1860 DO_VPQ(sve2p1_umaxqv_d, uint64_t, H8, 0, DO_MAX)
1861 
1862 DO_VPQ(sve2p1_uminqv_b, uint8_t, H1, -1, DO_MIN)
1863 DO_VPQ(sve2p1_uminqv_h, uint16_t, H2, -1, DO_MIN)
1864 DO_VPQ(sve2p1_uminqv_s, uint32_t, H4, -1, DO_MIN)
1865 DO_VPQ(sve2p1_uminqv_d, uint64_t, H8, -1, DO_MIN)
1866 
1867 #undef DO_VPQ
1868 
1869 /* Two vector operand, one scalar operand, unpredicated.  */
1870 #define DO_ZZI(NAME, TYPE, OP)                                       \
1871 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
1872 {                                                                    \
1873     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
1874     TYPE s = s64, *d = vd, *n = vn;                                  \
1875     for (i = 0; i < opr_sz; ++i) {                                   \
1876         d[i] = OP(n[i], s);                                          \
1877     }                                                                \
1878 }
1879 
1880 #define DO_SUBR(X, Y)   (Y - X)
1881 
1882 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1883 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1884 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1885 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1886 
1887 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1888 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1889 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1890 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1891 
1892 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1893 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1894 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1895 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1896 
1897 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1898 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1899 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1900 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1901 
1902 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1903 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1904 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1905 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1906 
1907 #undef DO_ZZI
1908 
1909 #define DO_LOGIC_QV(NAME, SUFF, INIT, VOP, POP)                         \
1910 void HELPER(NAME ## _ ## SUFF)(void *vd, void *vn, void *vg, uint32_t desc) \
1911 {                                                                       \
1912     unsigned seg = simd_oprsz(desc) / 16;                               \
1913     uint64_t r0 = INIT, r1 = INIT;                                      \
1914     for (unsigned s = 0; s < seg; s++) {                                \
1915         uint64_t p0 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2))); \
1916         uint64_t p1 = expand_pred_##SUFF(*(uint8_t *)(vg + H1(s * 2 + 1))); \
1917         uint64_t v0 = *(uint64_t *)(vn + s * 16);                       \
1918         uint64_t v1 = *(uint64_t *)(vn + s * 16 + 8);                   \
1919         v0 = POP(v0, p0), v1 = POP(v1, p1);                             \
1920         r0 = VOP(r0, v0), r1 = VOP(r1, v1);                             \
1921     }                                                                   \
1922     *(uint64_t *)(vd + 0) = r0;                                         \
1923     *(uint64_t *)(vd + 8) = r1;                                         \
1924     clear_tail(vd, 16, simd_maxsz(desc));                               \
1925 }
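
/*
 * Here POP folds the expanded predicate into each 64-bit half so that
 * inactive elements contribute the identity of VOP: AND-with-mask forces
 * inactive bits to 0 for ORQV/EORQV, while OR-with-complement (DO_ORC)
 * forces them to 1 for ANDQV.
 */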
1926 
1927 DO_LOGIC_QV(sve2p1_orqv, b, 0, DO_ORR, DO_AND)
1928 DO_LOGIC_QV(sve2p1_orqv, h, 0, DO_ORR, DO_AND)
1929 DO_LOGIC_QV(sve2p1_orqv, s, 0, DO_ORR, DO_AND)
1930 DO_LOGIC_QV(sve2p1_orqv, d, 0, DO_ORR, DO_AND)
1931 
1932 DO_LOGIC_QV(sve2p1_eorqv, b, 0, DO_EOR, DO_AND)
1933 DO_LOGIC_QV(sve2p1_eorqv, h, 0, DO_EOR, DO_AND)
1934 DO_LOGIC_QV(sve2p1_eorqv, s, 0, DO_EOR, DO_AND)
1935 DO_LOGIC_QV(sve2p1_eorqv, d, 0, DO_EOR, DO_AND)
1936 
1937 DO_LOGIC_QV(sve2p1_andqv, b, -1, DO_AND, DO_ORC)
1938 DO_LOGIC_QV(sve2p1_andqv, h, -1, DO_AND, DO_ORC)
1939 DO_LOGIC_QV(sve2p1_andqv, s, -1, DO_AND, DO_ORC)
1940 DO_LOGIC_QV(sve2p1_andqv, d, -1, DO_AND, DO_ORC)
1941 
1942 #undef DO_LOGIC_QV
1943 
1944 #undef DO_AND
1945 #undef DO_ORR
1946 #undef DO_EOR
1947 #undef DO_BIC
1948 #undef DO_ORC
1949 #undef DO_ADD
1950 #undef DO_SUB
1951 #undef DO_MAX
1952 #undef DO_MIN
1953 #undef DO_ABD
1954 #undef DO_MUL
1955 #undef DO_DIV
1956 #undef DO_ASR
1957 #undef DO_LSR
1958 #undef DO_LSL
1959 #undef DO_SUBR
1960 
1961 /* Similar to the ARM LastActiveElement pseudocode function, except the
1962    result is multiplied by the element size.  This includes the not found
1963    indication; e.g. not found for esz=3 is -8.  */
1964 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1965 {
1966     uint64_t mask = pred_esz_masks[esz];
1967     intptr_t i = words;
1968 
1969     do {
1970         uint64_t this_g = g[--i] & mask;
1971         if (this_g) {
1972             return i * 64 + (63 - clz64(this_g));
1973         }
1974     } while (i > 0);
1975     return (intptr_t)-1 << esz;
1976 }
1977 
1978 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1979 {
1980     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1981     uint32_t flags = PREDTEST_INIT;
1982     uint64_t *d = vd, *g = vg;
1983     intptr_t i = 0;
1984 
1985     do {
1986         uint64_t this_d = d[i];
1987         uint64_t this_g = g[i];
1988 
1989         if (this_g) {
1990             if (!(flags & 4)) {
1991                 /* Set in D the first bit of G.  */
1992                 this_d |= this_g & -this_g;
1993                 d[i] = this_d;
1994             }
1995             flags = iter_predtest_fwd(this_d, this_g, flags);
1996         }
1997     } while (++i < words);
1998 
1999     return flags;
2000 }
2001 
2002 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
2003 {
2004     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
2005     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2006     uint32_t flags = PREDTEST_INIT;
2007     uint64_t *d = vd, *g = vg, esz_mask;
2008     intptr_t i, next;
2009 
2010     next = last_active_element(vd, words, esz) + (1 << esz);
2011     esz_mask = pred_esz_masks[esz];
2012 
2013     /* Similar to the pseudocode for pnext, but scaled by ESZ
2014        so that we find the correct bit.  */
2015     if (next < words * 64) {
2016         uint64_t mask = -1;
2017 
2018         if (next & 63) {
2019             mask = ~((1ull << (next & 63)) - 1);
2020             next &= -64;
2021         }
2022         do {
2023             uint64_t this_g = g[next / 64] & esz_mask & mask;
2024             if (this_g != 0) {
2025                 next = (next & -64) + ctz64(this_g);
2026                 break;
2027             }
2028             next += 64;
2029             mask = -1;
2030         } while (next < words * 64);
2031     }
2032 
2033     i = 0;
2034     do {
2035         uint64_t this_d = 0;
2036         if (i == next / 64) {
2037             this_d = 1ull << (next & 63);
2038         }
2039         d[i] = this_d;
2040         flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
2041     } while (++i < words);
2042 
2043     return flags;
2044 }
2045 
2046 /*
2047  * Copy Zn into Zd, and store zero into inactive elements.
2048  * If inv, store zeros into the active elements.
2049  */
2050 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
2051 {
2052     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2053     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2054     uint64_t *d = vd, *n = vn;
2055     uint8_t *pg = vg;
2056 
2057     for (i = 0; i < opr_sz; i += 1) {
2058         d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
2059     }
2060 }
2061 
2062 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
2063 {
2064     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2065     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2066     uint64_t *d = vd, *n = vn;
2067     uint8_t *pg = vg;
2068 
2069     for (i = 0; i < opr_sz; i += 1) {
2070         d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
2071     }
2072 }
2073 
2074 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
2075 {
2076     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2077     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2078     uint64_t *d = vd, *n = vn;
2079     uint8_t *pg = vg;
2080 
2081     for (i = 0; i < opr_sz; i += 1) {
2082         d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
2083     }
2084 }
2085 
2086 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
2087 {
2088     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2089     uint64_t *d = vd, *n = vn;
2090     uint8_t *pg = vg;
2091     uint8_t inv = simd_data(desc);
2092 
2093     for (i = 0; i < opr_sz; i += 1) {
2094         d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2095     }
2096 }
2097 
2098 /* Three-operand expander, immediate operand, controlled by a predicate.
2099  */
2100 #define DO_ZPZI(NAME, TYPE, H, OP)                              \
2101 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2102 {                                                               \
2103     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2104     TYPE imm = simd_data(desc);                                 \
2105     for (i = 0; i < opr_sz; ) {                                 \
2106         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
2107         do {                                                    \
2108             if (pg & 1) {                                       \
2109                 TYPE nn = *(TYPE *)(vn + H(i));                 \
2110                 *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
2111             }                                                   \
2112             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
2113         } while (i & 15);                                       \
2114     }                                                           \
2115 }
2116 
2117 /* Similarly, specialized for 64-bit operands.  */
2118 #define DO_ZPZI_D(NAME, TYPE, OP)                               \
2119 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2120 {                                                               \
2121     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
2122     TYPE *d = vd, *n = vn;                                      \
2123     TYPE imm = simd_data(desc);                                 \
2124     uint8_t *pg = vg;                                           \
2125     for (i = 0; i < opr_sz; i += 1) {                           \
2126         if (pg[H1(i)] & 1) {                                    \
2127             TYPE nn = n[i];                                     \
2128             d[i] = OP(nn, imm);                                 \
2129         }                                                       \
2130     }                                                           \
2131 }
2132 
2133 #define DO_SHR(N, M)  (N >> M)
2134 #define DO_SHL(N, M)  (N << M)
2135 
2136 /* Arithmetic shift right for division.  This rounds negative numbers
2137    toward zero as per signed division.  Therefore before shifting,
2138    when N is negative, add 2**M-1.  */
2139 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
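
/*
 * A minimal sketch of that rounding, assuming only <stdint.h> (the helper
 * name is illustrative, not part of this file): e.g. asrd_sketch(-7, 2)
 * yields -1, matching -7 / 4 truncated toward zero, whereas the plain
 * arithmetic shift -7 >> 2 yields -2.
 */
static inline int64_t asrd_sketch(int64_t n, unsigned sh)
{
    int64_t bias = n < 0 ? ((int64_t)1 << sh) - 1 : 0;  /* 2**sh - 1 for n < 0 */
    return (n + bias) >> sh;                            /* == n / 2**sh        */
}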
2140 
2141 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2142 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2143 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2144 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2145 
2146 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2147 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2148 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2149 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2150 
2151 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2152 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2153 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2154 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2155 
2156 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2157 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2158 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2159 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2160 
2161 /* SVE2 bitwise shift by immediate */
2162 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2163 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2164 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2165 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2166 
2167 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2168 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2169 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2170 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2171 
2172 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2173 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2174 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2175 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2176 
2177 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2178 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2179 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2180 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2181 
2182 #define do_suqrshl_b(n, m) \
2183    ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2184 #define do_suqrshl_h(n, m) \
2185    ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2186 #define do_suqrshl_s(n, m) \
2187    ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2188 #define do_suqrshl_d(n, m) \
2189    ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2190 
2191 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2192 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2193 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2194 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2195 
2196 #undef DO_ASRD
2197 #undef DO_ZPZI
2198 #undef DO_ZPZI_D
2199 
2200 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2201 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
2202 {                                                            \
2203     intptr_t i, opr_sz = simd_oprsz(desc);                   \
2204     int shift = simd_data(desc);                             \
2205     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
2206         TYPEW nn = *(TYPEW *)(vn + i);                       \
2207         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
2208     }                                                        \
2209 }
2210 
2211 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
2212 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
2213 {                                                                 \
2214     intptr_t i, opr_sz = simd_oprsz(desc);                        \
2215     int shift = simd_data(desc);                                  \
2216     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
2217         TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
2218         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
2219     }                                                             \
2220 }
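
/*
 * The *B (bottom) forms store the narrowed value as a whole TYPEW, so the
 * top half of each destination element is zeroed; the *T (top) forms store
 * only the TYPEN top half and leave the bottom half untouched.  E.g. with
 * shift 4, a 16-bit source element 0xabcd narrows to 0xbc: sve2_shrnb_h
 * writes 0x00bc to the destination element, while sve2_shrnt_h writes 0xbc
 * into its top byte only.
 */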
2221 
2222 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2223 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2224 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2225 
2226 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2227 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2228 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2229 
2230 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2231 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2232 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2233 
2234 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2235 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2236 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2237 
2238 #define DO_SQSHRUN_H(x, sh) do_usat_b((int64_t)(x) >> sh)
2239 #define DO_SQSHRUN_S(x, sh) do_usat_h((int64_t)(x) >> sh)
2240 #define DO_SQSHRUN_D(x, sh) do_usat_s((int64_t)(x) >> (sh < 64 ? sh : 63))
2241 
2242 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2243 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2244 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2245 
2246 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2247 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2248 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2249 
2250 #define DO_SQRSHRUN_H(x, sh) do_usat_b(do_srshr(x, sh))
2251 #define DO_SQRSHRUN_S(x, sh) do_usat_h(do_srshr(x, sh))
2252 #define DO_SQRSHRUN_D(x, sh) do_usat_s(do_srshr(x, sh))
2253 
2254 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2255 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2256 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2257 
2258 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2259 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2260 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2261 
2262 #define DO_SQSHRN_H(x, sh) do_ssat_b(x >> sh)
2263 #define DO_SQSHRN_S(x, sh) do_ssat_h(x >> sh)
2264 #define DO_SQSHRN_D(x, sh) do_ssat_s(x >> sh)
2265 
2266 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2267 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2268 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2269 
2270 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2271 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2272 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2273 
2274 #define DO_SQRSHRN_H(x, sh) do_ssat_b(do_srshr(x, sh))
2275 #define DO_SQRSHRN_S(x, sh) do_ssat_h(do_srshr(x, sh))
2276 #define DO_SQRSHRN_D(x, sh) do_ssat_s(do_srshr(x, sh))
2277 
2278 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2279 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2280 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2281 
2282 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2283 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2284 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2285 
2286 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2287 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2288 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2289 
2290 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2291 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2292 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2293 
2294 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2295 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2296 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2297 
2298 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2299 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2300 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2301 
2302 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2303 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2304 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2305 
2306 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2307 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2308 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2309 
2310 #undef DO_SHRNB
2311 #undef DO_SHRNT
2312 
2313 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                           \
2314 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2315 {                                                                           \
2316     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2317     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2318         TYPEW nn = *(TYPEW *)(vn + i);                                      \
2319         TYPEW mm = *(TYPEW *)(vm + i);                                      \
2320         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);                      \
2321     }                                                                       \
2322 }
2323 
2324 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)                   \
2325 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2326 {                                                                           \
2327     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2328     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2329         TYPEW nn = *(TYPEW *)(vn + HW(i));                                  \
2330         TYPEW mm = *(TYPEW *)(vm + HW(i));                                  \
2331         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);         \
2332     }                                                                       \
2333 }
2334 
2335 #define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
2336 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2337 #define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
2338 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
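
/*
 * Worked example for the 16-to-8-bit forms (SHIFT = 8): with N = 0x12f0 and
 * M = 0x0ff0 the sum is 0x22e0, so DO_ADDHN returns 0x22, while DO_RADDHN
 * adds the rounding constant 0x80 first and returns 0x23.
 */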
2339 
2340 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2341 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2342 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2343 
2344 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2345 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2346 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2347 
2348 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2349 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2350 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2351 
2352 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2353 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2354 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2355 
2356 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2357 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2358 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2359 
2360 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2361 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2362 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2363 
2364 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2365 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2366 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2367 
2368 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2369 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2370 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2371 
2372 #undef DO_RSUBHN
2373 #undef DO_SUBHN
2374 #undef DO_RADDHN
2375 #undef DO_ADDHN
2376 
2377 #undef DO_BINOPNB
2378 
2379 /* Fully general four-operand expander, controlled by a predicate.
2380  */
2381 #define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
2382 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2383                   void *vg, uint32_t desc)                    \
2384 {                                                             \
2385     intptr_t i, opr_sz = simd_oprsz(desc);                    \
2386     for (i = 0; i < opr_sz; ) {                               \
2387         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
2388         do {                                                  \
2389             if (pg & 1) {                                     \
2390                 TYPE nn = *(TYPE *)(vn + H(i));               \
2391                 TYPE mm = *(TYPE *)(vm + H(i));               \
2392                 TYPE aa = *(TYPE *)(va + H(i));               \
2393                 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
2394             }                                                 \
2395             i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
2396         } while (i & 15);                                     \
2397     }                                                         \
2398 }
2399 
2400 /* Similarly, specialized for 64-bit operands.  */
2401 #define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
2402 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2403                   void *vg, uint32_t desc)                    \
2404 {                                                             \
2405     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
2406     TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
2407     uint8_t *pg = vg;                                         \
2408     for (i = 0; i < opr_sz; i += 1) {                         \
2409         if (pg[H1(i)] & 1) {                                  \
2410             TYPE aa = a[i], nn = n[i], mm = m[i];             \
2411             d[i] = OP(aa, nn, mm);                            \
2412         }                                                     \
2413     }                                                         \
2414 }
2415 
2416 #define DO_MLA(A, N, M)  (A + N * M)
2417 #define DO_MLS(A, N, M)  (A - N * M)
2418 
2419 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2420 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2421 
2422 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2423 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2424 
2425 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2426 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2427 
2428 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2429 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2430 
2431 #undef DO_MLA
2432 #undef DO_MLS
2433 #undef DO_ZPZZZ
2434 #undef DO_ZPZZZ_D
2435 
2436 void HELPER(sve_index_b)(void *vd, uint32_t start,
2437                          uint32_t incr, uint32_t desc)
2438 {
2439     intptr_t i, opr_sz = simd_oprsz(desc);
2440     uint8_t *d = vd;
2441     for (i = 0; i < opr_sz; i += 1) {
2442         d[H1(i)] = start + i * incr;
2443     }
2444 }
2445 
2446 void HELPER(sve_index_h)(void *vd, uint32_t start,
2447                          uint32_t incr, uint32_t desc)
2448 {
2449     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2450     uint16_t *d = vd;
2451     for (i = 0; i < opr_sz; i += 1) {
2452         d[H2(i)] = start + i * incr;
2453     }
2454 }
2455 
2456 void HELPER(sve_index_s)(void *vd, uint32_t start,
2457                          uint32_t incr, uint32_t desc)
2458 {
2459     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2460     uint32_t *d = vd;
2461     for (i = 0; i < opr_sz; i += 1) {
2462         d[H4(i)] = start + i * incr;
2463     }
2464 }
2465 
2466 void HELPER(sve_index_d)(void *vd, uint64_t start,
2467                          uint64_t incr, uint32_t desc)
2468 {
2469     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2470     uint64_t *d = vd;
2471     for (i = 0; i < opr_sz; i += 1) {
2472         d[i] = start + i * incr;
2473     }
2474 }
2475 
2476 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2477 {
2478     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2479     uint32_t sh = simd_data(desc);
2480     uint32_t *d = vd, *n = vn, *m = vm;
2481     for (i = 0; i < opr_sz; i += 1) {
2482         d[i] = n[i] + (m[i] << sh);
2483     }
2484 }
2485 
2486 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2487 {
2488     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2489     uint64_t sh = simd_data(desc);
2490     uint64_t *d = vd, *n = vn, *m = vm;
2491     for (i = 0; i < opr_sz; i += 1) {
2492         d[i] = n[i] + (m[i] << sh);
2493     }
2494 }
2495 
2496 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2497 {
2498     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2499     uint64_t sh = simd_data(desc);
2500     uint64_t *d = vd, *n = vn, *m = vm;
2501     for (i = 0; i < opr_sz; i += 1) {
2502         d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2503     }
2504 }
2505 
2506 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2507 {
2508     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2509     uint64_t sh = simd_data(desc);
2510     uint64_t *d = vd, *n = vn, *m = vm;
2511     for (i = 0; i < opr_sz; i += 1) {
2512         d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2513     }
2514 }
2515 
2516 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2517 {
2518     /* These constants are copied directly from the ARM pseudocode.  */
2519     static const uint16_t coeff[] = {
2520         0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2521         0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2522         0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2523         0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2524     };
2525     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2526     uint16_t *d = vd, *n = vn;
2527 
2528     for (i = 0; i < opr_sz; i++) {
2529         uint16_t nn = n[i];
2530         intptr_t idx = extract32(nn, 0, 5);
2531         uint16_t exp = extract32(nn, 5, 5);
2532         d[i] = coeff[idx] | (exp << 10);
2533     }
2534 }
2535 
2536 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2537 {
2538     /* These constants are cut and pasted directly from the ARM pseudocode.  */
2539     static const uint32_t coeff[] = {
2540         0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2541         0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2542         0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2543         0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2544         0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2545         0x1ef532, 0x20b051, 0x227043, 0x243516,
2546         0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2547         0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2548         0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2549         0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2550         0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2551         0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2552         0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2553         0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2554         0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2555         0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2556     };
2557     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2558     uint32_t *d = vd, *n = vn;
2559 
2560     for (i = 0; i < opr_sz; i++) {
2561         uint32_t nn = n[i];
2562         intptr_t idx = extract32(nn, 0, 6);
2563         uint32_t exp = extract32(nn, 6, 8);
2564         d[i] = coeff[idx] | (exp << 23);
2565     }
2566 }
2567 
2568 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2569 {
2570     /* These constants are cut and pasted directly from the ARM pseudocode.  */
2571     static const uint64_t coeff[] = {
2572         0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2573         0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2574         0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2575         0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2576         0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2577         0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2578         0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2579         0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2580         0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2581         0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2582         0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2583         0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2584         0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2585         0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2586         0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2587         0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2588         0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2589         0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2590         0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2591         0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2592         0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2593         0xFA7C1819E90D8ull,
2594     };
2595     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2596     uint64_t *d = vd, *n = vn;
2597 
2598     for (i = 0; i < opr_sz; i++) {
2599         uint64_t nn = n[i];
2600         intptr_t idx = extract32(nn, 0, 6);
2601         uint64_t exp = extract32(nn, 6, 11);
2602         d[i] = coeff[idx] | (exp << 52);
2603     }
2604 }
2605 
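/*
 * FTSSEL: bit 0 of each element of the second operand selects the
 * constant 1.0 in place of the first operand, and bit 1 negates the
 * result.  The *_maybe_ah_chs() helpers apply FPCR.AH semantics to the
 * sign change (NaNs keep their sign when AH is set).
 */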
2606 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2607 {
2608     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2609     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2610     uint16_t *d = vd, *n = vn, *m = vm;
2611     for (i = 0; i < opr_sz; i += 1) {
2612         uint16_t nn = n[i];
2613         uint16_t mm = m[i];
2614         if (mm & 1) {
2615             nn = float16_one;
2616         }
2617         if (mm & 2) {
2618             nn = float16_maybe_ah_chs(nn, fpcr_ah);
2619         }
2620         d[i] = nn;
2621     }
2622 }
2623 
2624 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2625 {
2626     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2627     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2628     uint32_t *d = vd, *n = vn, *m = vm;
2629     for (i = 0; i < opr_sz; i += 1) {
2630         uint32_t nn = n[i];
2631         uint32_t mm = m[i];
2632         if (mm & 1) {
2633             nn = float32_one;
2634         }
2635         if (mm & 2) {
2636             nn = float32_maybe_ah_chs(nn, fpcr_ah);
2637         }
2638         d[i] = nn;
2639     }
2640 }
2641 
2642 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2643 {
2644     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2645     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2646     uint64_t *d = vd, *n = vn, *m = vm;
2647     for (i = 0; i < opr_sz; i += 1) {
2648         uint64_t nn = n[i];
2649         uint64_t mm = m[i];
2650         if (mm & 1) {
2651             nn = float64_one;
2652         }
2653         if (mm & 2) {
2654             nn = float64_maybe_ah_chs(nn, fpcr_ah);
2655         }
2656         d[i] = nn;
2657     }
2658 }
2659 
2660 /*
2661  * Signed saturating addition with scalar operand.
2662  */
2663 
2664 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2665 {
2666     intptr_t i, oprsz = simd_oprsz(desc);
2667 
2668     for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2669         *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2670     }
2671 }
2672 
2673 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2674 {
2675     intptr_t i, oprsz = simd_oprsz(desc);
2676 
2677     for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2678         *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2679     }
2680 }
2681 
2682 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2683 {
2684     intptr_t i, oprsz = simd_oprsz(desc);
2685 
2686     for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2687         *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2688     }
2689 }
2690 
2691 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2692 {
2693     intptr_t i, oprsz = simd_oprsz(desc);
2694 
2695     for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2696         *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2697     }
2698 }
2699 
2700 /*
2701  * Unsigned saturating addition with scalar operand.
2702  */
2703 
2704 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2705 {
2706     intptr_t i, oprsz = simd_oprsz(desc);
2707 
2708     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2709         *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2710     }
2711 }
2712 
2713 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2714 {
2715     intptr_t i, oprsz = simd_oprsz(desc);
2716 
2717     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2718         *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2719     }
2720 }
2721 
2722 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2723 {
2724     intptr_t i, oprsz = simd_oprsz(desc);
2725 
2726     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2727         *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2728     }
2729 }
2730 
2731 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2732 {
2733     intptr_t i, oprsz = simd_oprsz(desc);
2734 
2735     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2736         *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2737     }
2738 }
2739 
2740 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2741 {
2742     intptr_t i, oprsz = simd_oprsz(desc);
2743 
2744     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2745         *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2746     }
2747 }
2748 
2749 /* Two operand predicated copy immediate with merge.  All valid immediates
2750  * can fit within 17 signed bits in the simd_data field.
2751  */
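/*
 * For example, expand_pred_b(0x05) == 0x0000000000ff00ff: predicate
 * bit K becomes an all-ones byte mask for element K, so the merge
 * below reduces to a bitwise select.
 */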
2752 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2753                          uint64_t mm, uint32_t desc)
2754 {
2755     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2756     uint64_t *d = vd, *n = vn;
2757     uint8_t *pg = vg;
2758 
2759     mm = dup_const(MO_8, mm);
2760     for (i = 0; i < opr_sz; i += 1) {
2761         uint64_t nn = n[i];
2762         uint64_t pp = expand_pred_b(pg[H1(i)]);
2763         d[i] = (mm & pp) | (nn & ~pp);
2764     }
2765 }
2766 
2767 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2768                          uint64_t mm, uint32_t desc)
2769 {
2770     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2771     uint64_t *d = vd, *n = vn;
2772     uint8_t *pg = vg;
2773 
2774     mm = dup_const(MO_16, mm);
2775     for (i = 0; i < opr_sz; i += 1) {
2776         uint64_t nn = n[i];
2777         uint64_t pp = expand_pred_h(pg[H1(i)]);
2778         d[i] = (mm & pp) | (nn & ~pp);
2779     }
2780 }
2781 
2782 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2783                          uint64_t mm, uint32_t desc)
2784 {
2785     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2786     uint64_t *d = vd, *n = vn;
2787     uint8_t *pg = vg;
2788 
2789     mm = dup_const(MO_32, mm);
2790     for (i = 0; i < opr_sz; i += 1) {
2791         uint64_t nn = n[i];
2792         uint64_t pp = expand_pred_s(pg[H1(i)]);
2793         d[i] = (mm & pp) | (nn & ~pp);
2794     }
2795 }
2796 
2797 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2798                          uint64_t mm, uint32_t desc)
2799 {
2800     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2801     uint64_t *d = vd, *n = vn;
2802     uint8_t *pg = vg;
2803 
2804     for (i = 0; i < opr_sz; i += 1) {
2805         uint64_t nn = n[i];
2806         d[i] = (pg[H1(i)] & 1 ? mm : nn);
2807     }
2808 }
2809 
2810 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2811 {
2812     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2813     uint64_t *d = vd;
2814     uint8_t *pg = vg;
2815 
2816     val = dup_const(MO_8, val);
2817     for (i = 0; i < opr_sz; i += 1) {
2818         d[i] = val & expand_pred_b(pg[H1(i)]);
2819     }
2820 }
2821 
2822 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2823 {
2824     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2825     uint64_t *d = vd;
2826     uint8_t *pg = vg;
2827 
2828     val = dup_const(MO_16, val);
2829     for (i = 0; i < opr_sz; i += 1) {
2830         d[i] = val & expand_pred_h(pg[H1(i)]);
2831     }
2832 }
2833 
2834 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2835 {
2836     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2837     uint64_t *d = vd;
2838     uint8_t *pg = vg;
2839 
2840     val = dup_const(MO_32, val);
2841     for (i = 0; i < opr_sz; i += 1) {
2842         d[i] = val & expand_pred_s(pg[H1(i)]);
2843     }
2844 }
2845 
2846 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2847 {
2848     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2849     uint64_t *d = vd;
2850     uint8_t *pg = vg;
2851 
2852     for (i = 0; i < opr_sz; i += 1) {
2853         d[i] = (pg[H1(i)] & 1 ? val : 0);
2854     }
2855 }
2856 
2857 /* Big-endian hosts need to frob the byte indices.  If the copy
2858  * happens to be 8-byte aligned, then no frobbing is necessary.
2859  */
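/*
 * Concretely: when source, destination and length are 4-byte but not
 * 8-byte aligned, we copy in 32-bit units through H1_4(), which on a
 * big-endian host adjusts the byte address (an xor with 4) so that
 * each unit lands where the guest's little-endian layout expects it.
 */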
2860 static void swap_memmove(void *vd, void *vs, size_t n)
2861 {
2862     uintptr_t d = (uintptr_t)vd;
2863     uintptr_t s = (uintptr_t)vs;
2864     uintptr_t o = (d | s | n) & 7;
2865     size_t i;
2866 
2867 #if !HOST_BIG_ENDIAN
2868     o = 0;
2869 #endif
2870     switch (o) {
2871     case 0:
2872         memmove(vd, vs, n);
2873         break;
2874 
2875     case 4:
2876         if (d < s || d >= s + n) {
2877             for (i = 0; i < n; i += 4) {
2878                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2879             }
2880         } else {
2881             for (i = n; i > 0; ) {
2882                 i -= 4;
2883                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2884             }
2885         }
2886         break;
2887 
2888     case 2:
2889     case 6:
2890         if (d < s || d >= s + n) {
2891             for (i = 0; i < n; i += 2) {
2892                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2893             }
2894         } else {
2895             for (i = n; i > 0; ) {
2896                 i -= 2;
2897                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2898             }
2899         }
2900         break;
2901 
2902     default:
2903         if (d < s || d >= s + n) {
2904             for (i = 0; i < n; i++) {
2905                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2906             }
2907         } else {
2908             for (i = n; i > 0; ) {
2909                 i -= 1;
2910                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2911             }
2912         }
2913         break;
2914     }
2915 }
2916 
2917 /* Similarly for memset of 0.  */
2918 static void swap_memzero(void *vd, size_t n)
2919 {
2920     uintptr_t d = (uintptr_t)vd;
2921     uintptr_t o = (d | n) & 7;
2922     size_t i;
2923 
2924     /* Usually, the first bit of a predicate is set, so N is 0.  */
2925     if (likely(n == 0)) {
2926         return;
2927     }
2928 
2929 #if !HOST_BIG_ENDIAN
2930     o = 0;
2931 #endif
2932     switch (o) {
2933     case 0:
2934         memset(vd, 0, n);
2935         break;
2936 
2937     case 4:
2938         for (i = 0; i < n; i += 4) {
2939             *(uint32_t *)H1_4(d + i) = 0;
2940         }
2941         break;
2942 
2943     case 2:
2944     case 6:
2945         for (i = 0; i < n; i += 2) {
2946             *(uint16_t *)H1_2(d + i) = 0;
2947         }
2948         break;
2949 
2950     default:
2951         for (i = 0; i < n; i++) {
2952             *(uint8_t *)H1(d + i) = 0;
2953         }
2954         break;
2955     }
2956 }
2957 
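/*
 * EXT: the result is the OPR_SZ bytes starting N_OFS bytes into the
 * double-width value formed by Zn (low) and Zm (high), i.e. the top of
 * Zn followed by the bottom of Zm.  The three branches below differ
 * only in how they handle the destination overlapping a source.
 */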
2958 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2959 {
2960     intptr_t opr_sz = simd_oprsz(desc);
2961     size_t n_ofs = simd_data(desc);
2962     size_t n_siz = opr_sz - n_ofs;
2963 
2964     if (vd != vm) {
2965         swap_memmove(vd, vn + n_ofs, n_siz);
2966         swap_memmove(vd + n_siz, vm, n_ofs);
2967     } else if (vd != vn) {
2968         swap_memmove(vd + n_siz, vd, n_ofs);
2969         swap_memmove(vd, vn + n_ofs, n_siz);
2970     } else {
2971         /* vd == vn == vm.  Need temp space.  */
2972         ARMVectorReg tmp;
2973         swap_memmove(&tmp, vm, n_ofs);
2974         swap_memmove(vd, vd + n_ofs, n_siz);
2975         memcpy(vd + n_siz, &tmp, n_ofs);
2976     }
2977 }
2978 
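/*
 * INSR: shift the whole vector up by one element and place the scalar
 * into element 0.
 */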
2979 #define DO_INSR(NAME, TYPE, H) \
2980 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2981 {                                                                  \
2982     intptr_t opr_sz = simd_oprsz(desc);                            \
2983     swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
2984     *(TYPE *)(vd + H(0)) = val;                                    \
2985 }
2986 
2987 DO_INSR(sve_insr_b, uint8_t, H1)
2988 DO_INSR(sve_insr_h, uint16_t, H1_2)
2989 DO_INSR(sve_insr_s, uint32_t, H1_4)
2990 DO_INSR(sve_insr_d, uint64_t, H1_8)
2991 
2992 #undef DO_INSR
2993 
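/*
 * REV (vector): reverse the order of the elements.  Work 64 bits at a
 * time, exchanging a chunk from the front with one from the back and
 * reversing the elements within each chunk: bswap64 for bytes, hswap64
 * for halfwords, a 32-bit rotate for words, nothing extra for
 * doublewords.
 */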
2994 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2995 {
2996     intptr_t i, j, opr_sz = simd_oprsz(desc);
2997     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2998         uint64_t f = *(uint64_t *)(vn + i);
2999         uint64_t b = *(uint64_t *)(vn + j);
3000         *(uint64_t *)(vd + i) = bswap64(b);
3001         *(uint64_t *)(vd + j) = bswap64(f);
3002     }
3003 }
3004 
3005 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
3006 {
3007     intptr_t i, j, opr_sz = simd_oprsz(desc);
3008     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3009         uint64_t f = *(uint64_t *)(vn + i);
3010         uint64_t b = *(uint64_t *)(vn + j);
3011         *(uint64_t *)(vd + i) = hswap64(b);
3012         *(uint64_t *)(vd + j) = hswap64(f);
3013     }
3014 }
3015 
3016 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
3017 {
3018     intptr_t i, j, opr_sz = simd_oprsz(desc);
3019     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3020         uint64_t f = *(uint64_t *)(vn + i);
3021         uint64_t b = *(uint64_t *)(vn + j);
3022         *(uint64_t *)(vd + i) = rol64(b, 32);
3023         *(uint64_t *)(vd + j) = rol64(f, 32);
3024     }
3025 }
3026 
3027 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
3028 {
3029     intptr_t i, j, opr_sz = simd_oprsz(desc);
3030     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
3031         uint64_t f = *(uint64_t *)(vn + i);
3032         uint64_t b = *(uint64_t *)(vn + j);
3033         *(uint64_t *)(vd + i) = b;
3034         *(uint64_t *)(vd + j) = f;
3035     }
3036 }
3037 
3038 /*
3039  * TODO: This could use half_shuffle64 and similar bit tricks to
3040  * expand blocks of bits at once.
3041  */
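/*
 * PMOV to predicate: slice IDX of the vector provides one bit per
 * element; bit E of that slice becomes the flag bit of predicate
 * element E, and all other predicate bits are cleared.
 */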
3042 #define DO_PMOV_PV(NAME, ESIZE)                                 \
3043 void HELPER(NAME)(void *vd, void *vs, uint32_t desc)            \
3044 {                                                               \
3045     unsigned vl = simd_oprsz(desc);                             \
3046     unsigned idx = simd_data(desc);                             \
3047     unsigned elements = vl / ESIZE;                             \
3048     ARMPredicateReg *d = vd;                                    \
3049     ARMVectorReg *s = vs;                                       \
3050     memset(d, 0, sizeof(*d));                                   \
3051     for (unsigned e = 0; e < elements; ++e) {                   \
3052         depositn(d->p, e * ESIZE, 1, extractn(s->d, elements * idx + e, 1)); \
3053     }                                                           \
3054 }
3055 
3056 DO_PMOV_PV(pmov_pv_h, 2)
3057 DO_PMOV_PV(pmov_pv_s, 4)
3058 DO_PMOV_PV(pmov_pv_d, 8)
3059 
3060 #undef DO_PMOV_PV
3061 
3062 /*
3063  * TODO: This could use half_unshuffle64 and similar bit tricks to
3064  * compress blocks of bits at once.
3065  */
3066 #define DO_PMOV_VP(NAME, ESIZE)                                 \
3067 void HELPER(NAME)(void *vd, void *vs, uint32_t desc)            \
3068 {                                                               \
3069     unsigned vl = simd_oprsz(desc);                             \
3070     unsigned idx = simd_data(desc);                             \
3071     unsigned elements = vl / ESIZE;                             \
3072     ARMVectorReg *d = vd;                                       \
3073     ARMPredicateReg *s = vs;                                    \
3074     if (idx == 0) {                                             \
3075         memset(d, 0, vl);                                       \
3076     }                                                           \
3077     for (unsigned e = 0; e < elements; ++e) {                   \
3078         depositn(d->d, elements * idx + e, 1, extractn(s->p, e * ESIZE, 1)); \
3079     }                                                           \
3080 }
3081 
3082 DO_PMOV_VP(pmov_vp_h, 2)
3083 DO_PMOV_VP(pmov_vp_s, 4)
3084 DO_PMOV_VP(pmov_vp_d, 8)
3085 
3086 #undef DO_PMOV_VP
3087 
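/*
 * TBL/TBX table lookups.  An out-of-range index yields zero for TBL but
 * leaves the destination element unchanged for TBX; the two-register
 * SVE2 form treats Zn0:Zn1 as a single table with twice as many
 * entries.
 */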
3088 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
3089 
3090 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
3091                            bool is_tbx, tb_impl_fn *fn)
3092 {
3093     ARMVectorReg scratch;
3094     uintptr_t oprsz = simd_oprsz(desc);
3095 
3096     if (unlikely(vd == vn)) {
3097         vn = memcpy(&scratch, vn, oprsz);
3098     }
3099 
3100     fn(vd, vn, NULL, vm, oprsz, is_tbx);
3101 }
3102 
3103 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
3104                            uint32_t desc, bool is_tbx, tb_impl_fn *fn)
3105 {
3106     ARMVectorReg scratch;
3107     uintptr_t oprsz = simd_oprsz(desc);
3108 
3109     if (unlikely(vd == vn0)) {
3110         vn0 = memcpy(&scratch, vn0, oprsz);
3111         if (vd == vn1) {
3112             vn1 = vn0;
3113         }
3114     } else if (unlikely(vd == vn1)) {
3115         vn1 = memcpy(&scratch, vn1, oprsz);
3116     }
3117 
3118     fn(vd, vn0, vn1, vm, oprsz, is_tbx);
3119 }
3120 
3121 #define DO_TB(SUFF, TYPE, H)                                            \
3122 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
3123                                 void *vm, uintptr_t oprsz, bool is_tbx) \
3124 {                                                                       \
3125     TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
3126     uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
3127     for (i = 0; i < nelem; ++i) {                                       \
3128         TYPE index = indexes[H1(i)], val = 0;                           \
3129         if (index < nelem) {                                            \
3130             val = tbl0[H(index)];                                       \
3131         } else {                                                        \
3132             index -= nelem;                                             \
3133             if (tbl1 && index < nelem) {                                \
3134                 val = tbl1[H(index)];                                   \
3135             } else if (is_tbx) {                                        \
3136                 continue;                                               \
3137             }                                                           \
3138         }                                                               \
3139         d[H(i)] = val;                                                  \
3140     }                                                                   \
3141 }                                                                       \
3142 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3143 {                                                                       \
3144     do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                     \
3145 }                                                                       \
3146 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,            \
3147                              void *vm, uint32_t desc)                   \
3148 {                                                                       \
3149     do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);               \
3150 }                                                                       \
3151 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3152 {                                                                       \
3153     do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                      \
3154 }
3155 
3156 DO_TB(b, uint8_t, H1)
3157 DO_TB(h, uint16_t, H2)
3158 DO_TB(s, uint32_t, H4)
3159 DO_TB(d, uint64_t, H8)
3160 
3161 #undef DO_TB
3162 
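/*
 * UNPK: widen each narrow source element to the destination element
 * size, sign- or zero-extending as appropriate.
 */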
3163 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3164 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
3165 {                                                              \
3166     intptr_t i, opr_sz = simd_oprsz(desc);                     \
3167     TYPED *d = vd;                                             \
3168     TYPES *n = vn;                                             \
3169     ARMVectorReg tmp;                                          \
3170     if (unlikely(vn - vd < opr_sz)) {                          \
3171         n = memcpy(&tmp, n, opr_sz / 2);                       \
3172     }                                                          \
3173     for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
3174         d[HD(i)] = n[HS(i)];                                   \
3175     }                                                          \
3176 }
3177 
3178 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3179 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3180 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3181 
3182 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3183 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3184 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3185 
3186 #undef DO_UNPK
3187 
3188 /* Mask of bits included in the even-numbered predicates of width esz.
3189  * We also use this for expand_bits/compress_bits, and so extend the
3190  * same pattern out to 16-bit units.
3191  */
3192 static const uint64_t even_bit_esz_masks[5] = {
3193     0x5555555555555555ull,
3194     0x3333333333333333ull,
3195     0x0f0f0f0f0f0f0f0full,
3196     0x00ff00ff00ff00ffull,
3197     0x0000ffff0000ffffull,
3198 };
3199 
3200 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3201  * For N==0, this corresponds to the operation that in qemu/bitops.h
3202  * we call half_shuffle64; this algorithm is from Hacker's Delight,
3203  * section 7-2 Shuffling Bits.
3204  */
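/*
 * For example, expand_bits(0b1011, 0) == 0b01000101: input bit K moves
 * to output bit 2K with a zero inserted above it.
 */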
3205 static uint64_t expand_bits(uint64_t x, int n)
3206 {
3207     int i;
3208 
3209     x &= 0xffffffffu;
3210     for (i = 4; i >= n; i--) {
3211         int sh = 1 << i;
3212         x = ((x << sh) | x) & even_bit_esz_masks[i];
3213     }
3214     return x;
3215 }
3216 
3217 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3218  * For N==0, this corresponds to the operation that in qemu/bitops.h
3219  * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3220  * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3221  */
3222 static uint64_t compress_bits(uint64_t x, int n)
3223 {
3224     int i;
3225 
3226     for (i = n; i <= 4; i++) {
3227         int sh = 1 << i;
3228         x &= even_bit_esz_masks[i];
3229         x = (x >> sh) | x;
3230     }
3231     return x & 0xffffffffu;
3232 }
3233 
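/*
 * ZIP (predicates): interleave the element flags of the selected halves
 * of Pn and Pm.  For byte elements this means result bit 2K is Pn flag K
 * and result bit 2K+1 is Pm flag K; wider element sizes interleave
 * ESIZE-bit groups instead.
 */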
3234 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3235 {
3236     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3237     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3238     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3239     int esize = 1 << esz;
3240     uint64_t *d = vd;
3241     intptr_t i;
3242 
3243     if (oprsz <= 8) {
3244         uint64_t nn = *(uint64_t *)vn;
3245         uint64_t mm = *(uint64_t *)vm;
3246         int half = 4 * oprsz;
3247 
3248         nn = extract64(nn, high * half, half);
3249         mm = extract64(mm, high * half, half);
3250         nn = expand_bits(nn, esz);
3251         mm = expand_bits(mm, esz);
3252         d[0] = nn | (mm << esize);
3253     } else {
3254         ARMPredicateReg tmp;
3255 
3256         /* We produce output faster than we consume input.
3257            Therefore we must be mindful of possible overlap.  */
3258         if (vd == vn) {
3259             vn = memcpy(&tmp, vn, oprsz);
3260             if (vd == vm) {
3261                 vm = vn;
3262             }
3263         } else if (vd == vm) {
3264             vm = memcpy(&tmp, vm, oprsz);
3265         }
3266         if (high) {
3267             high = oprsz >> 1;
3268         }
3269 
3270         if ((oprsz & 7) == 0) {
3271             uint32_t *n = vn, *m = vm;
3272             high >>= 2;
3273 
3274             for (i = 0; i < oprsz / 8; i++) {
3275                 uint64_t nn = n[H4(high + i)];
3276                 uint64_t mm = m[H4(high + i)];
3277 
3278                 nn = expand_bits(nn, esz);
3279                 mm = expand_bits(mm, esz);
3280                 d[i] = nn | (mm << esize);
3281             }
3282         } else {
3283             uint8_t *n = vn, *m = vm;
3284             uint16_t *d16 = vd;
3285 
3286             for (i = 0; i < oprsz / 2; i++) {
3287                 uint16_t nn = n[H1(high + i)];
3288                 uint16_t mm = m[H1(high + i)];
3289 
3290                 nn = expand_bits(nn, esz);
3291                 mm = expand_bits(mm, esz);
3292                 d16[H2(i)] = nn | (mm << esize);
3293             }
3294         }
3295     }
3296 }
3297 
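/*
 * UZP (predicates): concatenate Pn:Pm and keep every second element
 * flag, starting with element 0 (UZP1) or element 1 (UZP2).
 */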
3298 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3299 {
3300     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3301     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3302     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3303     uint64_t *d = vd, *n = vn, *m = vm;
3304     uint64_t l, h;
3305     intptr_t i;
3306 
3307     if (oprsz <= 8) {
3308         l = compress_bits(n[0] >> odd, esz);
3309         h = compress_bits(m[0] >> odd, esz);
3310         d[0] = l | (h << (4 * oprsz));
3311     } else {
3312         ARMPredicateReg tmp_m;
3313         intptr_t oprsz_16 = oprsz / 16;
3314 
3315         if ((vm - vd) < (uintptr_t)oprsz) {
3316             m = memcpy(&tmp_m, vm, oprsz);
3317         }
3318 
3319         for (i = 0; i < oprsz_16; i++) {
3320             l = n[2 * i + 0];
3321             h = n[2 * i + 1];
3322             l = compress_bits(l >> odd, esz);
3323             h = compress_bits(h >> odd, esz);
3324             d[i] = l | (h << 32);
3325         }
3326 
3327         /*
3328          * For VL which is not a multiple of 512, the results from M do not
3329          * align nicely with the uint64_t for D.  Put the aligned results
3330          * from M into TMP_M and then copy it into place afterward.
3331          */
3332         if (oprsz & 15) {
3333             int final_shift = (oprsz & 15) * 2;
3334 
3335             l = n[2 * i + 0];
3336             h = n[2 * i + 1];
3337             l = compress_bits(l >> odd, esz);
3338             h = compress_bits(h >> odd, esz);
3339             d[i] = l | (h << final_shift);
3340 
3341             for (i = 0; i < oprsz_16; i++) {
3342                 l = m[2 * i + 0];
3343                 h = m[2 * i + 1];
3344                 l = compress_bits(l >> odd, esz);
3345                 h = compress_bits(h >> odd, esz);
3346                 tmp_m.p[i] = l | (h << 32);
3347             }
3348             l = m[2 * i + 0];
3349             h = m[2 * i + 1];
3350             l = compress_bits(l >> odd, esz);
3351             h = compress_bits(h >> odd, esz);
3352             tmp_m.p[i] = l | (h << final_shift);
3353 
3354             swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3355         } else {
3356             for (i = 0; i < oprsz_16; i++) {
3357                 l = m[2 * i + 0];
3358                 h = m[2 * i + 1];
3359                 l = compress_bits(l >> odd, esz);
3360                 h = compress_bits(h >> odd, esz);
3361                 d[oprsz_16 + i] = l | (h << 32);
3362             }
3363         }
3364     }
3365 }
3366 
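/*
 * TRN (predicates): TRN1 places the even-numbered elements of Pn into
 * the even result positions and the even-numbered elements of Pm into
 * the odd positions; TRN2 does the same with the odd-numbered source
 * elements.
 */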
3367 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3368 {
3369     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3370     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3371     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3372     uint64_t *d = vd, *n = vn, *m = vm;
3373     uint64_t mask;
3374     int shr, shl;
3375     intptr_t i;
3376 
3377     shl = 1 << esz;
3378     shr = 0;
3379     mask = even_bit_esz_masks[esz];
3380     if (odd) {
3381         mask <<= shl;
3382         shr = shl;
3383         shl = 0;
3384     }
3385 
3386     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3387         uint64_t nn = (n[i] & mask) >> shr;
3388         uint64_t mm = (m[i] & mask) << shl;
3389         d[i] = nn + mm;
3390     }
3391 }
3392 
3393 /* Reverse units of 2**N bits.  */
3394 static uint64_t reverse_bits_64(uint64_t x, int n)
3395 {
3396     int i, sh;
3397 
3398     x = bswap64(x);
3399     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3400         uint64_t mask = even_bit_esz_masks[i];
3401         x = ((x & mask) << sh) | ((x >> sh) & mask);
3402     }
3403     return x;
3404 }
3405 
3406 static uint8_t reverse_bits_8(uint8_t x, int n)
3407 {
3408     static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3409     int i, sh;
3410 
3411     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3412         x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3413     }
3414     return x;
3415 }
3416 
3417 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3418 {
3419     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3420     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3421     intptr_t i, oprsz_2 = oprsz / 2;
3422 
3423     if (oprsz <= 8) {
3424         uint64_t l = *(uint64_t *)vn;
3425         l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3426         *(uint64_t *)vd = l;
3427     } else if ((oprsz & 15) == 0) {
3428         for (i = 0; i < oprsz_2; i += 8) {
3429             intptr_t ih = oprsz - 8 - i;
3430             uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3431             uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3432             *(uint64_t *)(vd + i) = h;
3433             *(uint64_t *)(vd + ih) = l;
3434         }
3435     } else {
3436         for (i = 0; i < oprsz_2; i += 1) {
3437             intptr_t il = H1(i);
3438             intptr_t ih = H1(oprsz - 1 - i);
3439             uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3440             uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3441             *(uint8_t *)(vd + il) = h;
3442             *(uint8_t *)(vd + ih) = l;
3443         }
3444     }
3445 }
3446 
3447 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3448 {
3449     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3450     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3451     uint64_t *d = vd;
3452     intptr_t i;
3453 
3454     if (oprsz <= 8) {
3455         uint64_t nn = *(uint64_t *)vn;
3456         int half = 4 * oprsz;
3457 
3458         nn = extract64(nn, high * half, half);
3459         nn = expand_bits(nn, 0);
3460         d[0] = nn;
3461     } else {
3462         ARMPredicateReg tmp_n;
3463 
3464         /* We produce output faster than we consume input.
3465            Therefore we must be mindful of possible overlap.  */
3466         if ((vn - vd) < (uintptr_t)oprsz) {
3467             vn = memcpy(&tmp_n, vn, oprsz);
3468         }
3469         if (high) {
3470             high = oprsz >> 1;
3471         }
3472 
3473         if ((oprsz & 7) == 0) {
3474             uint32_t *n = vn;
3475             high >>= 2;
3476 
3477             for (i = 0; i < oprsz / 8; i++) {
3478                 uint64_t nn = n[H4(high + i)];
3479                 d[i] = expand_bits(nn, 0);
3480             }
3481         } else {
3482             uint16_t *d16 = vd;
3483             uint8_t *n = vn;
3484 
3485             for (i = 0; i < oprsz / 2; i++) {
3486                 uint16_t nn = n[H1(high + i)];
3487                 d16[H2(i)] = expand_bits(nn, 0);
3488             }
3489         }
3490     }
3491 }
3492 
3493 #define DO_ZIP(NAME, TYPE, H) \
3494 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
3495 {                                                                    \
3496     intptr_t oprsz = simd_oprsz(desc);                               \
3497     intptr_t odd_ofs = simd_data(desc);                              \
3498     intptr_t i, oprsz_2 = oprsz / 2;                                 \
3499     ARMVectorReg tmp_n, tmp_m;                                       \
3500     /* We produce output faster than we consume input.               \
3501        Therefore we must be mindful of possible overlap.  */         \
3502     if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
3503         vn = memcpy(&tmp_n, vn, oprsz);                              \
3504     }                                                                \
3505     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
3506         vm = memcpy(&tmp_m, vm, oprsz);                              \
3507     }                                                                \
3508     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
3509         *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3510         *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
3511             *(TYPE *)(vm + odd_ofs + H(i));                          \
3512     }                                                                \
3513     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
3514         memset(vd + oprsz - 16, 0, 16);                              \
3515     }                                                                \
3516 }
3517 
3518 DO_ZIP(sve_zip_b, uint8_t, H1)
3519 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3520 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3521 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3522 DO_ZIP(sve2_zip_q, Int128, )
3523 
3524 #define DO_UZP(NAME, TYPE, H) \
3525 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3526 {                                                                      \
3527     intptr_t oprsz = simd_oprsz(desc);                                 \
3528     intptr_t odd_ofs = simd_data(desc);                                \
3529     intptr_t i, p;                                                     \
3530     ARMVectorReg tmp_m;                                                \
3531     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
3532         vm = memcpy(&tmp_m, vm, oprsz);                                \
3533     }                                                                  \
3534     i = 0, p = odd_ofs;                                                \
3535     do {                                                               \
3536         *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
3537         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3538     } while (p < oprsz);                                               \
3539     p -= oprsz;                                                        \
3540     do {                                                               \
3541         *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
3542         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3543     } while (p < oprsz);                                               \
3544     tcg_debug_assert(i == oprsz);                                      \
3545 }
3546 
3547 DO_UZP(sve_uzp_b, uint8_t, H1)
3548 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3549 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3550 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3551 DO_UZP(sve2_uzp_q, Int128, )
3552 
3553 typedef void perseg_zzz_fn(void *vd, void *vn, void *vm, uint32_t desc);
3554 
3555 static void do_perseg_zzz(void *vd, void *vn, void *vm,
3556                           uint32_t desc, perseg_zzz_fn *fn)
3557 {
3558     intptr_t oprsz = simd_oprsz(desc);
3559 
3560     desc = simd_desc(16, 16, simd_data(desc));
3561     for (intptr_t i = 0; i < oprsz; i += 16) {
3562         fn(vd + i, vn + i, vm + i, desc);
3563     }
3564 }
3565 
3566 #define DO_PERSEG_ZZZ(NAME, FUNC) \
3567     void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3568     { do_perseg_zzz(vd, vn, vm, desc, FUNC); }
3569 
3570 DO_PERSEG_ZZZ(sve2p1_uzpq_b, helper_sve_uzp_b)
3571 DO_PERSEG_ZZZ(sve2p1_uzpq_h, helper_sve_uzp_h)
3572 DO_PERSEG_ZZZ(sve2p1_uzpq_s, helper_sve_uzp_s)
3573 DO_PERSEG_ZZZ(sve2p1_uzpq_d, helper_sve_uzp_d)
3574 
3575 DO_PERSEG_ZZZ(sve2p1_zipq_b, helper_sve_zip_b)
3576 DO_PERSEG_ZZZ(sve2p1_zipq_h, helper_sve_zip_h)
3577 DO_PERSEG_ZZZ(sve2p1_zipq_s, helper_sve_zip_s)
3578 DO_PERSEG_ZZZ(sve2p1_zipq_d, helper_sve_zip_d)
3579 
3580 DO_PERSEG_ZZZ(sve2p1_tblq_b, helper_sve_tbl_b)
3581 DO_PERSEG_ZZZ(sve2p1_tblq_h, helper_sve_tbl_h)
3582 DO_PERSEG_ZZZ(sve2p1_tblq_s, helper_sve_tbl_s)
3583 DO_PERSEG_ZZZ(sve2p1_tblq_d, helper_sve_tbl_d)
3584 
3585 DO_PERSEG_ZZZ(sve2p1_tbxq_b, helper_sve2_tbx_b)
3586 DO_PERSEG_ZZZ(sve2p1_tbxq_h, helper_sve2_tbx_h)
3587 DO_PERSEG_ZZZ(sve2p1_tbxq_s, helper_sve2_tbx_s)
3588 DO_PERSEG_ZZZ(sve2p1_tbxq_d, helper_sve2_tbx_d)
3589 
3590 #undef DO_PERSEG_ZZZ
3591 
3592 #define DO_TRN(NAME, TYPE, H) \
3593 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3594 {                                                                      \
3595     intptr_t oprsz = simd_oprsz(desc);                                 \
3596     intptr_t odd_ofs = simd_data(desc);                                \
3597     intptr_t i;                                                        \
3598     for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
3599         TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
3600         TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
3601         *(TYPE *)(vd + H(i + 0)) = ae;                                 \
3602         *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
3603     }                                                                  \
3604     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
3605         memset(vd + oprsz - 16, 0, 16);                                \
3606     }                                                                  \
3607 }
3608 
3609 DO_TRN(sve_trn_b, uint8_t, H1)
3610 DO_TRN(sve_trn_h, uint16_t, H1_2)
3611 DO_TRN(sve_trn_s, uint32_t, H1_4)
3612 DO_TRN(sve_trn_d, uint64_t, H1_8)
3613 DO_TRN(sve2_trn_q, Int128, )
3614 
3615 #undef DO_ZIP
3616 #undef DO_UZP
3617 #undef DO_TRN
3618 
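/*
 * COMPACT: copy the active elements to the low end of the destination,
 * preserving their order, and zero the remaining elements.
 */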
3619 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3620 {
3621     intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3622     uint32_t *d = vd, *n = vn;
3623     uint8_t *pg = vg;
3624 
3625     for (i = j = 0; i < opr_sz; i++) {
3626         if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3627             d[H4(j)] = n[H4(i)];
3628             j++;
3629         }
3630     }
3631     for (; j < opr_sz; j++) {
3632         d[H4(j)] = 0;
3633     }
3634 }
3635 
3636 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3637 {
3638     intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3639     uint64_t *d = vd, *n = vn;
3640     uint8_t *pg = vg;
3641 
3642     for (i = j = 0; i < opr_sz; i++) {
3643         if (pg[H1(i)] & 1) {
3644             d[j] = n[i];
3645             j++;
3646         }
3647     }
3648     for (; j < opr_sz; j++) {
3649         d[j] = 0;
3650     }
3651 }
3652 
3653 /* Similar to the ARM LastActiveElement pseudocode function, except the
3654  * result is multiplied by the element size.  This includes the not found
3655  * indication; e.g. not found for esz=3 is -8.
3656  */
3657 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3658 {
3659     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3660     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3661 
3662     return last_active_element(vg, words, esz);
3663 }
3664 
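/*
 * SPLICE: copy the elements of Zn from the first active element through
 * the last active element (inclusive) to the low end of the
 * destination, then fill the remainder from the low end of Zm.
 */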
3665 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3666 {
3667     intptr_t opr_sz = simd_oprsz(desc) / 8;
3668     int esz = simd_data(desc);
3669     uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3670     intptr_t i, first_i, last_i;
3671     ARMVectorReg tmp;
3672 
3673     first_i = last_i = 0;
3674     first_g = last_g = 0;
3675 
3676     /* Find the extent of the active elements within VG.  */
3677     for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3678         pg = *(uint64_t *)(vg + i) & mask;
3679         if (pg) {
3680             if (last_g == 0) {
3681                 last_g = pg;
3682                 last_i = i;
3683             }
3684             first_g = pg;
3685             first_i = i;
3686         }
3687     }
3688 
3689     len = 0;
3690     if (first_g != 0) {
3691         first_i = first_i * 8 + ctz64(first_g);
3692         last_i = last_i * 8 + 63 - clz64(last_g);
3693         len = last_i - first_i + (1 << esz);
3694         if (vd == vm) {
3695             vm = memcpy(&tmp, vm, opr_sz * 8);
3696         }
3697         swap_memmove(vd, vn + first_i, len);
3698     }
3699     swap_memmove(vd + len, vm, opr_sz * 8 - len);
3700 }
3701 
3702 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3703                             void *vg, uint32_t desc)
3704 {
3705     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3706     uint64_t *d = vd, *n = vn, *m = vm;
3707     uint8_t *pg = vg;
3708 
3709     for (i = 0; i < opr_sz; i += 1) {
3710         uint64_t nn = n[i], mm = m[i];
3711         uint64_t pp = expand_pred_b(pg[H1(i)]);
3712         d[i] = (nn & pp) | (mm & ~pp);
3713     }
3714 }
3715 
3716 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3717                             void *vg, uint32_t desc)
3718 {
3719     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3720     uint64_t *d = vd, *n = vn, *m = vm;
3721     uint8_t *pg = vg;
3722 
3723     for (i = 0; i < opr_sz; i += 1) {
3724         uint64_t nn = n[i], mm = m[i];
3725         uint64_t pp = expand_pred_h(pg[H1(i)]);
3726         d[i] = (nn & pp) | (mm & ~pp);
3727     }
3728 }
3729 
3730 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3731                             void *vg, uint32_t desc)
3732 {
3733     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3734     uint64_t *d = vd, *n = vn, *m = vm;
3735     uint8_t *pg = vg;
3736 
3737     for (i = 0; i < opr_sz; i += 1) {
3738         uint64_t nn = n[i], mm = m[i];
3739         uint64_t pp = expand_pred_s(pg[H1(i)]);
3740         d[i] = (nn & pp) | (mm & ~pp);
3741     }
3742 }
3743 
3744 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3745                             void *vg, uint32_t desc)
3746 {
3747     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3748     uint64_t *d = vd, *n = vn, *m = vm;
3749     uint8_t *pg = vg;
3750 
3751     for (i = 0; i < opr_sz; i += 1) {
3752         uint64_t nn = n[i], mm = m[i];
3753         d[i] = (pg[H1(i)] & 1 ? nn : mm);
3754     }
3755 }
3756 
3757 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3758                             void *vg, uint32_t desc)
3759 {
3760     intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3761     Int128 *d = vd, *n = vn, *m = vm;
3762     uint16_t *pg = vg;
3763 
3764     for (i = 0; i < opr_sz; i += 1) {
3765         d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3766     }
3767 }
3768 
3769 /* Two operand comparison controlled by a predicate.
3770  * ??? It is very tempting to want to be able to expand this inline
3771  * with x86 instructions, e.g.
3772  *
3773  *    vcmpeqw    zm, zn, %ymm0
3774  *    vpmovmskb  %ymm0, %eax
3775  *    and        $0x5555, %eax
3776  *    and        pg, %eax
3777  *
3778  * or even aarch64, e.g.
3779  *
3780  *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3781  *    cmeq       v0.8h, zn, zm
3782  *    and        v0.8h, v0.8h, mask
3783  *    addv       h0, v0.8h
3784  *    and        v0.8b, pg
3785  *
3786  * However, coming up with an abstraction that allows vector inputs and
3787  * a scalar output, and also handles the byte-ordering of sub-uint64_t
3788  * scalar outputs, is tricky.
3789  */
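/*
 * Each loop below walks the vector from the top down, producing one
 * uint64_t of Pd (64 predicate bits, covering 64 bytes of vector data)
 * per outer iteration; the inner loop builds that word one element at a
 * time.  Feeding each completed word to iter_predtest_bwd() computes
 * the NZCV flags in the same pass.
 */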
3790 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
3791 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3792 {                                                                            \
3793     intptr_t opr_sz = simd_oprsz(desc);                                      \
3794     uint32_t flags = PREDTEST_INIT;                                          \
3795     intptr_t i = opr_sz;                                                     \
3796     do {                                                                     \
3797         uint64_t out = 0, pg;                                                \
3798         do {                                                                 \
3799             i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
3800             TYPE nn = *(TYPE *)(vn + H(i));                                  \
3801             TYPE mm = *(TYPE *)(vm + H(i));                                  \
3802             out |= nn OP mm;                                                 \
3803         } while (i & 63);                                                    \
3804         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3805         out &= pg;                                                           \
3806         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3807         flags = iter_predtest_bwd(out, pg, flags);                           \
3808     } while (i > 0);                                                         \
3809     return flags;                                                            \
3810 }
3811 
3812 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3813     DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3814 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3815     DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3816 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3817     DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3818 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3819     DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3820 
3821 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
3822 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3823 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3824 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3825 
3826 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
3827 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3828 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3829 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3830 
3831 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
3832 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3833 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3834 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3835 
3836 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
3837 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3838 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3839 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3840 
3841 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
3842 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3843 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3844 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3845 
3846 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
3847 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3848 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3849 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3850 
3851 #undef DO_CMP_PPZZ_B
3852 #undef DO_CMP_PPZZ_H
3853 #undef DO_CMP_PPZZ_S
3854 #undef DO_CMP_PPZZ_D
3855 #undef DO_CMP_PPZZ
3856 
3857 /* Similar, but the second source is "wide".  */
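/*
 * Each 64-bit element of the wide operand is compared against every
 * narrow element of Zn that occupies the same 8 bytes.
 */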
3858 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
3859 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3860 {                                                                            \
3861     intptr_t opr_sz = simd_oprsz(desc);                                      \
3862     uint32_t flags = PREDTEST_INIT;                                          \
3863     intptr_t i = opr_sz;                                                     \
3864     do {                                                                     \
3865         uint64_t out = 0, pg;                                                \
3866         do {                                                                 \
3867             TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
3868             do {                                                             \
3869                 i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
3870                 TYPE nn = *(TYPE *)(vn + H(i));                              \
3871                 out |= nn OP mm;                                             \
3872             } while (i & 7);                                                 \
3873         } while (i & 63);                                                    \
3874         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3875         out &= pg;                                                           \
3876         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3877         flags = iter_predtest_bwd(out, pg, flags);                           \
3878     } while (i > 0);                                                         \
3879     return flags;                                                            \
3880 }
3881 
3882 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3883     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
3884 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3885     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3886 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3887     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3888 
3889 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
3890 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3891 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3892 
3893 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
3894 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3895 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3896 
3897 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
3898 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
3899 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)
3900 
3901 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
3902 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
3903 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)
3904 
3905 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
3906 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3907 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3908 
3909 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
3910 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3911 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3912 
3913 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
3914 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
3915 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)
3916 
3917 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
3918 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
3919 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)
3920 
3921 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
3922 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3923 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3924 
3925 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
3926 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3927 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3928 
3929 #undef DO_CMP_PPZW_B
3930 #undef DO_CMP_PPZW_H
3931 #undef DO_CMP_PPZW_S
3932 #undef DO_CMP_PPZW
3933 
3934 /* Similar, but the second source is immediate.  */
3935 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
3936 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
3937 {                                                                    \
3938     intptr_t opr_sz = simd_oprsz(desc);                              \
3939     uint32_t flags = PREDTEST_INIT;                                  \
3940     TYPE mm = simd_data(desc);                                       \
3941     intptr_t i = opr_sz;                                             \
3942     do {                                                             \
3943         uint64_t out = 0, pg;                                        \
3944         do {                                                         \
3945             i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
3946             TYPE nn = *(TYPE *)(vn + H(i));                          \
3947             out |= nn OP mm;                                         \
3948         } while (i & 63);                                            \
3949         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
3950         out &= pg;                                                   \
3951         *(uint64_t *)(vd + (i >> 3)) = out;                          \
3952         flags = iter_predtest_bwd(out, pg, flags);                   \
3953     } while (i > 0);                                                 \
3954     return flags;                                                    \
3955 }
3956 
3957 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3958     DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3959 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3960     DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3961 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3962     DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3963 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3964     DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
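/*
 * SVE predicates keep one bit per byte of the element, so the MASK
 * constants above select every bit for bytes, every 2nd bit for
 * halfwords, every 4th for words and every 8th for doublewords; the
 * matching shift of OUT by sizeof(TYPE) per element keeps the
 * comparison results in the same layout.
 */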
3965 
3966 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
3967 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3968 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3969 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3970 
3971 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
3972 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3973 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3974 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3975 
3976 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
3977 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3978 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3979 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3980 
3981 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
3982 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3983 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3984 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3985 
3986 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
3987 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3988 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3989 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3990 
3991 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
3992 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3993 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3994 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3995 
3996 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
3997 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3998 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3999 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
4000 
4001 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
4002 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
4003 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
4004 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
4005 
4006 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
4007 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
4008 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
4009 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
4010 
4011 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
4012 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
4013 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
4014 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
4015 
4016 #undef DO_CMP_PPZI_B
4017 #undef DO_CMP_PPZI_H
4018 #undef DO_CMP_PPZI_S
4019 #undef DO_CMP_PPZI_D
4020 #undef DO_CMP_PPZI
4021 
4022 /* Similar to the ARM LastActive pseudocode function.  */
4023 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
4024 {
4025     intptr_t i;
4026 
4027     for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
4028         uint64_t pg = *(uint64_t *)(vg + i);
4029         if (pg) {
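            /*
             * pow2floor(pg) isolates the most significant guard bit in
             * this word, i.e. the last active element; test whether that
             * element is also set in D.
             */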
4030             return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
4031         }
4032     }
4033     return 0;
4034 }
4035 
4036 /* Compute a mask into RETB that is true for all G, up to and including
4037  * (if after) or excluding (if !after) the first G & N.
4038  * Return true if BRK found.
4039  */
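/*
 * For example, with g = 0b1111 and n = 0b0100 (bit 0 = first element):
 * after = true  gives b = 0b0111 (the breaking element stays active),
 * after = false gives b = 0b0011 (the breaking element is cleared).
 */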
4040 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
4041                         bool brk, bool after)
4042 {
4043     uint64_t b;
4044 
4045     if (brk) {
4046         b = 0;
4047     } else if ((g & n) == 0) {
4048         /* For all G, no N are set; break not found.  */
4049         b = g;
4050     } else {
4051         /* Break somewhere in N.  Locate it.  */
4052         b = g & n;            /* guard true, pred true */
4053         b = b & -b;           /* first such */
4054         if (after) {
4055             b = b | (b - 1);  /* break after same */
4056         } else {
4057             b = b - 1;        /* break before same */
4058         }
4059         brk = true;
4060     }
4061 
4062     *retb = b;
4063     return brk;
4064 }
4065 
4066 /* Compute a zeroing BRK.  */
4067 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
4068                           intptr_t oprsz, bool after)
4069 {
4070     bool brk = false;
4071     intptr_t i;
4072 
4073     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
4074         uint64_t this_b, this_g = g[i];
4075 
4076         brk = compute_brk(&this_b, n[i], this_g, brk, after);
4077         d[i] = this_b & this_g;
4078     }
4079 }
4080 
4081 /* Likewise, but also compute flags.  */
4082 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
4083                                intptr_t oprsz, bool after)
4084 {
4085     uint32_t flags = PREDTEST_INIT;
4086     bool brk = false;
4087     intptr_t i;
4088 
4089     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
4090         uint64_t this_b, this_d, this_g = g[i];
4091 
4092         brk = compute_brk(&this_b, n[i], this_g, brk, after);
4093         d[i] = this_d = this_b & this_g;
4094         flags = iter_predtest_fwd(this_d, this_g, flags);
4095     }
4096     return flags;
4097 }
4098 
4099 /* Compute a merging BRK.  */
4100 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
4101                           intptr_t oprsz, bool after)
4102 {
4103     bool brk = false;
4104     intptr_t i;
4105 
4106     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
4107         uint64_t this_b, this_g = g[i];
4108 
4109         brk = compute_brk(&this_b, n[i], this_g, brk, after);
4110         d[i] = (this_b & this_g) | (d[i] & ~this_g);
4111     }
4112 }
4113 
4114 /* Likewise, but also compute flags.  */
4115 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
4116                                intptr_t oprsz, bool after)
4117 {
4118     uint32_t flags = PREDTEST_INIT;
4119     bool brk = false;
4120     intptr_t i;
4121 
4122     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
4123         uint64_t this_b, this_d = d[i], this_g = g[i];
4124 
4125         brk = compute_brk(&this_b, n[i], this_g, brk, after);
4126         d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
4127         flags = iter_predtest_fwd(this_d, this_g, flags);
4128     }
4129     return flags;
4130 }
4131 
4132 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
4133                        uint32_t pred_desc)
4134 {
4135     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4136     if (last_active_pred(vn, vg, oprsz)) {
4137         compute_brk_z(vd, vm, vg, oprsz, true);
4138     } else {
4139         memset(vd, 0, sizeof(ARMPredicateReg));
4140     }
4141 }
4142 
4143 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
4144                             uint32_t pred_desc)
4145 {
4146     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4147     if (last_active_pred(vn, vg, oprsz)) {
4148         return compute_brks_z(vd, vm, vg, oprsz, true);
4149     } else {
4150         memset(vd, 0, sizeof(ARMPredicateReg));
4151         return PREDTEST_INIT;
4152     }
4153 }
4154 
4155 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
4156                        uint32_t pred_desc)
4157 {
4158     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4159     if (last_active_pred(vn, vg, oprsz)) {
4160         compute_brk_z(vd, vm, vg, oprsz, false);
4161     } else {
4162         memset(vd, 0, sizeof(ARMPredicateReg));
4163     }
4164 }
4165 
4166 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4167                             uint32_t pred_desc)
4168 {
4169     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4170     if (last_active_pred(vn, vg, oprsz)) {
4171         return compute_brks_z(vd, vm, vg, oprsz, false);
4172     } else {
4173         memset(vd, 0, sizeof(ARMPredicateReg));
4174         return PREDTEST_INIT;
4175     }
4176 }
4177 
4178 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4179 {
4180     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4181     compute_brk_z(vd, vn, vg, oprsz, true);
4182 }
4183 
4184 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4185 {
4186     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4187     return compute_brks_z(vd, vn, vg, oprsz, true);
4188 }
4189 
4190 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4191 {
4192     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4193     compute_brk_z(vd, vn, vg, oprsz, false);
4194 }
4195 
4196 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4197 {
4198     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4199     return compute_brks_z(vd, vn, vg, oprsz, false);
4200 }
4201 
4202 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4203 {
4204     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4205     compute_brk_m(vd, vn, vg, oprsz, true);
4206 }
4207 
4208 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4209 {
4210     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4211     return compute_brks_m(vd, vn, vg, oprsz, true);
4212 }
4213 
4214 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4215 {
4216     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4217     compute_brk_m(vd, vn, vg, oprsz, false);
4218 }
4219 
4220 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4221 {
4222     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4223     return compute_brks_m(vd, vn, vg, oprsz, false);
4224 }
4225 
4226 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4227 {
4228     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4229     if (!last_active_pred(vn, vg, oprsz)) {
4230         memset(vd, 0, sizeof(ARMPredicateReg));
4231     }
4232 }
4233 
4234 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4235 {
4236     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4237     if (last_active_pred(vn, vg, oprsz)) {
4238         ARMPredicateReg *d = vd;
4239         uint32_t flags = PREDTEST_INIT;
4240         intptr_t i;
4241 
4242         /* As if PredTest(Ones(PL), D, MO_8).  */
4243         for (i = 0; i < oprsz / 8; i++) {
4244             flags = iter_predtest_fwd(d->p[i], -1, flags);
4245         }
4246         if (oprsz & 7) {
4247             uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4248             flags = iter_predtest_fwd(d->p[i], mask, flags);
4249         }
4250         return flags;
4251     }
4252     memset(vd, 0, sizeof(ARMPredicateReg));
4253     return PREDTEST_INIT;
4254 }
4255 
4256 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4257 {
4258     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4259     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4260     uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4261     intptr_t i;
4262 
4263     for (i = 0; i < words; ++i) {
4264         uint64_t t = n[i] & g[i] & mask;
4265         sum += ctpop64(t);
4266     }
4267     return sum;
4268 }
4269 
4270 uint64_t HELPER(sve2p1_cntp_c)(uint32_t png, uint32_t desc)
4271 {
4272     int pl = FIELD_EX32(desc, PREDDESC, OPRSZ);
4273     int vl = pl * 8;
4274     unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ);
4275     int lg2_width = FIELD_EX32(desc, PREDDESC, DATA) + 1;
4276     DecodeCounter p = decode_counter(png, vl, v_esz);
4277     unsigned maxelem = (vl << lg2_width) >> v_esz;
4278     unsigned count = p.count;
4279 
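    /*
     * The counter names the leading COUNT elements; when inverted those
     * leading elements are the inactive ones, so the number of active
     * elements is maxelem - count, clamped at zero.
     */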
4280     if (p.invert) {
4281         if (count >= maxelem) {
4282             return 0;
4283         }
4284         count = maxelem - count;
4285     } else {
4286         count = MIN(count, maxelem);
4287     }
4288     return count >> p.lg2_stride;
4289 }
4290 
4291 /* C.f. Arm pseudocode EncodePredCount */
4292 static uint64_t encode_pred_count(uint32_t elements, uint32_t count,
4293                                   uint32_t esz, bool invert)
4294 {
4295     uint32_t pred;
4296 
4297     if (count == 0) {
4298         return 0;
4299     }
4300     if (invert) {
4301         count = elements - count;
4302     } else if (count == elements) {
4303         count = 0;
4304         invert = true;
4305     }
4306 
4307     pred = (count << 1) | 1;
4308     pred <<= esz;
4309     pred |= invert << 15;
4310 
4311     return pred;
4312 }
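/*
 * For example, 3 active halfword elements (esz = 1) out of 8 encode as
 * ((3 << 1) | 1) << 1 = 0x0e with the invert bit clear; the all-active
 * case is canonicalised to count = 0 with bit 15 (invert) set.
 */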
4313 
4314 /* C.f. Arm pseudocode PredCountTest */
4315 static uint32_t pred_count_test(uint32_t elements, uint32_t count, bool invert)
4316 {
4317     uint32_t flags;
4318 
4319     if (count == 0) {
4320         flags = 1;                              /* !N, Z, C */
4321     } else if (!invert) {
4322         flags = (1u << 31) | 2;                 /* N, !Z */
4323         flags |= count != elements;             /* C */
4324     } else {
4325         flags = 2;                              /* !Z, !C */
4326         flags |= (count == elements) << 31;     /* N */
4327     }
4328     return flags;
4329 }
4330 
4331 /* D must be cleared on entry. */
4332 static void do_whilel(ARMPredicateReg *d, uint64_t esz_mask,
4333                       uint32_t count, uint32_t oprbits)
4334 {
4335     tcg_debug_assert(count <= oprbits);
4336     if (count) {
4337         uint32_t i;
4338 
4339         /* Set all of the requested bits.  */
4340         for (i = 0; i < count / 64; ++i) {
4341             d->p[i] = esz_mask;
4342         }
4343         if (count & 63) {
4344             d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4345         }
4346     }
4347 }
4348 
4349 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4350 {
4351     uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4352     uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4353     uint32_t oprbits = oprsz * 8;
4354     uint64_t esz_mask = pred_esz_masks[esz];
4355     ARMPredicateReg *d = vd;
4356 
4357     count <<= esz;
4358     memset(d, 0, sizeof(*d));
4359     do_whilel(d, esz_mask, count, oprbits);
4360     return pred_count_test(oprbits, count, false);
4361 }
4362 
4363 uint32_t HELPER(sve_while2l)(void *vd, uint32_t count, uint32_t pred_desc)
4364 {
4365     uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4366     uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4367     uint32_t oprbits = oprsz * 8;
4368     uint64_t esz_mask = pred_esz_masks[esz];
4369     ARMPredicateReg *d = vd;
4370 
4371     count <<= esz;
4372     memset(d, 0, 2 * sizeof(*d));
4373     if (count <= oprbits) {
4374         do_whilel(&d[0], esz_mask, count, oprbits);
4375     } else {
4376         do_whilel(&d[0], esz_mask, oprbits, oprbits);
4377         do_whilel(&d[1], esz_mask, count - oprbits, oprbits);
4378     }
4379 
4380     return pred_count_test(2 * oprbits, count, false);
4381 }
4382 
4383 uint32_t HELPER(sve_whilecl)(void *vd, uint32_t count, uint32_t pred_desc)
4384 {
4385     uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4386     uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4387     uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA);
4388     uint32_t vl = pl * 8;
4389     uint32_t elements = (vl >> esz) << scale;
4390     ARMPredicateReg *d = vd;
4391 
4392     *d = (ARMPredicateReg) {
4393         .p[0] = encode_pred_count(elements, count, esz, false)
4394     };
4395     return pred_count_test(elements, count, false);
4396 }
4397 
4398 /* D must be cleared on entry. */
4399 static void do_whileg(ARMPredicateReg *d, uint64_t esz_mask,
4400                       uint32_t count, uint32_t oprbits)
4401 {
4402     tcg_debug_assert(count <= oprbits);
4403     if (count) {
4404         uint32_t i, invcount = oprbits - count;
4405         uint64_t bits = esz_mask & MAKE_64BIT_MASK(invcount & 63, 64);
4406 
4407         for (i = invcount / 64; i < oprbits / 64; ++i) {
4408             d->p[i] = bits;
4409             bits = esz_mask;
4410         }
4411         if (oprbits & 63) {
4412             d->p[i] = bits & MAKE_64BIT_MASK(0, oprbits & 63);
4413         }
4414     }
4415 }
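/*
 * For example, with oprbits = 128, count = 40 and an all-ones esz_mask:
 * invcount = 88, word 0 is left clear and word 1 gets bits 24..63 set,
 * i.e. only the last 40 predicate bits are active.
 */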
4416 
4417 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4418 {
4419     uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4420     uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4421     uint32_t oprbits = oprsz * 8;
4422     uint64_t esz_mask = pred_esz_masks[esz];
4423     ARMPredicateReg *d = vd;
4424 
4425     count <<= esz;
4426     memset(d, 0, sizeof(*d));
4427     do_whileg(d, esz_mask, count, oprbits);
4428     return pred_count_test(oprbits, count, true);
4429 }
4430 
4431 uint32_t HELPER(sve_while2g)(void *vd, uint32_t count, uint32_t pred_desc)
4432 {
4433     uint32_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4434     uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4435     uint32_t oprbits = oprsz * 8;
4436     uint64_t esz_mask = pred_esz_masks[esz];
4437     ARMPredicateReg *d = vd;
4438 
4439     count <<= esz;
4440     memset(d, 0, 2 * sizeof(*d));
4441     if (count <= oprbits) {
4442         do_whileg(&d[1], esz_mask, count, oprbits);
4443     } else {
4444         do_whilel(&d[1], esz_mask, oprbits, oprbits);
4445         do_whileg(&d[0], esz_mask, count - oprbits, oprbits);
4446     }
4447 
4448     return pred_count_test(2 * oprbits, count, true);
4449 }
4450 
4451 uint32_t HELPER(sve_whilecg)(void *vd, uint32_t count, uint32_t pred_desc)
4452 {
4453     uint32_t pl = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4454     uint32_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4455     uint32_t scale = FIELD_EX32(pred_desc, PREDDESC, DATA);
4456     uint32_t vl = pl * 8;
4457     uint32_t elements = (vl >> esz) << scale;
4458     ARMPredicateReg *d = vd;
4459 
4460     *d = (ARMPredicateReg) {
4461         .p[0] = encode_pred_count(elements, count, esz, true)
4462     };
4463     return pred_count_test(elements, count, true);
4464 }
4465 
4466 /* Recursive reduction over a vector using a combining function;
4467  * C.f. the ARM ARM function ReducePredicated.
4468  *
4469  * While it would be possible to write this without the DATA temporary,
4470  * it is much simpler to process the predicate register this way.
4471  * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4472  * little to gain with a more complex non-recursive form.
4473  */
4474 #define DO_REDUCE(NAME, SUF, TYPE, H, FUNC, IDENT)                      \
4475 static TYPE FUNC##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4476 {                                                                     \
4477     if (n == 1) {                                                     \
4478         return *data;                                                 \
4479     } else {                                                          \
4480         uintptr_t half = n / 2;                                       \
4481         TYPE lo = FUNC##_reduce(data, status, half);                  \
4482         TYPE hi = FUNC##_reduce(data + half, status, half);           \
4483         return FUNC(lo, hi, status);                                  \
4484     }                                                                 \
4485 }                                                                     \
4486 uint64_t helper_sve_##NAME##v_##SUF(void *vn, void *vg,               \
4487                                     float_status *s, uint32_t desc)   \
4488 {                                                                     \
4489     uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
4490     TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
4491     for (i = 0; i < oprsz; ) {                                        \
4492         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
4493         do {                                                          \
4494             TYPE nn = *(TYPE *)(vn + H(i));                           \
4495             *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
4496             i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
4497         } while (i & 15);                                             \
4498     }                                                                 \
4499     for (; i < maxsz; i += sizeof(TYPE)) {                            \
4500         *(TYPE *)((void *)data + i) = IDENT;                          \
4501     }                                                                 \
4502     return FUNC##_reduce(data, s, maxsz / sizeof(TYPE));              \
4503 }                                                                     \
4504 void helper_sve2p1_##NAME##qv_##SUF(void *vd, void *vn, void *vg,     \
4505                                     float_status *status, uint32_t desc) \
4506 {                                                                     \
4507     unsigned oprsz = simd_oprsz(desc), segments = oprsz / 16;         \
4508     for (unsigned e = 0; e < 16; e += sizeof(TYPE)) {                 \
4509         TYPE data[ARM_MAX_VQ];                                        \
4510         for (unsigned s = 0; s < segments; s++) {                     \
4511             uint16_t pg = *(uint16_t *)(vg + H1_2(s * 2));            \
4512             TYPE nn = *(TYPE *)(vn + H(s * 16 + e));                  \
4513             data[s] = (pg >> e) & 1 ? nn : IDENT;                     \
4514         }                                                             \
4515         *(TYPE *)(vd + H(e)) = FUNC##_reduce(data, status, segments); \
4516     }                                                                 \
4517     clear_tail(vd, 16, simd_maxsz(desc));                             \
4518 }
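/*
 * In the plain (non-q) reduction, inactive elements and the tail beyond
 * oprsz are filled with IDENT so the reduction can run unconditionally;
 * maxsz (passed in simd_data) is expected to be a power of two so that
 * the halving recursion divides evenly.
 */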
4519 
4520 DO_REDUCE(fadd,h, float16, H1_2, float16_add, float16_zero)
4521 DO_REDUCE(fadd,s, float32, H1_4, float32_add, float32_zero)
4522 DO_REDUCE(fadd,d, float64, H1_8, float64_add, float64_zero)
4523 
4524 /* Identity is floatN_default_nan, without the function call.  */
4525 DO_REDUCE(fminnm,h, float16, H1_2, float16_minnum, 0x7E00)
4526 DO_REDUCE(fminnm,s, float32, H1_4, float32_minnum, 0x7FC00000)
4527 DO_REDUCE(fminnm,d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL)
4528 
4529 DO_REDUCE(fmaxnm,h, float16, H1_2, float16_maxnum, 0x7E00)
4530 DO_REDUCE(fmaxnm,s, float32, H1_4, float32_maxnum, 0x7FC00000)
4531 DO_REDUCE(fmaxnm,d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL)
4532 
4533 DO_REDUCE(fmin,h, float16, H1_2, float16_min, float16_infinity)
4534 DO_REDUCE(fmin,s, float32, H1_4, float32_min, float32_infinity)
4535 DO_REDUCE(fmin,d, float64, H1_8, float64_min, float64_infinity)
4536 
4537 DO_REDUCE(fmax,h, float16, H1_2, float16_max, float16_chs(float16_infinity))
4538 DO_REDUCE(fmax,s, float32, H1_4, float32_max, float32_chs(float32_infinity))
4539 DO_REDUCE(fmax,d, float64, H1_8, float64_max, float64_chs(float64_infinity))
4540 
4541 DO_REDUCE(ah_fmin,h, float16, H1_2, helper_vfp_ah_minh, float16_infinity)
4542 DO_REDUCE(ah_fmin,s, float32, H1_4, helper_vfp_ah_mins, float32_infinity)
4543 DO_REDUCE(ah_fmin,d, float64, H1_8, helper_vfp_ah_mind, float64_infinity)
4544 
4545 DO_REDUCE(ah_fmax,h, float16, H1_2, helper_vfp_ah_maxh,
4546           float16_chs(float16_infinity))
4547 DO_REDUCE(ah_fmax,s, float32, H1_4, helper_vfp_ah_maxs,
4548           float32_chs(float32_infinity))
4549 DO_REDUCE(ah_fmax,d, float64, H1_8, helper_vfp_ah_maxd,
4550           float64_chs(float64_infinity))
4551 
4552 #undef DO_REDUCE
4553 
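/*
 * FADDA is a strictly ordered accumulation: each active element is added
 * to the running scalar in element order, so the loops below cannot be
 * reassociated.
 */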
4554 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4555                              float_status *status, uint32_t desc)
4556 {
4557     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4558     float16 result = nn;
4559 
4560     do {
4561         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4562         do {
4563             if (pg & 1) {
4564                 float16 mm = *(float16 *)(vm + H1_2(i));
4565                 result = float16_add(result, mm, status);
4566             }
4567             i += sizeof(float16), pg >>= sizeof(float16);
4568         } while (i & 15);
4569     } while (i < opr_sz);
4570 
4571     return result;
4572 }
4573 
4574 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4575                              float_status *status, uint32_t desc)
4576 {
4577     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4578     float32 result = nn;
4579 
4580     do {
4581         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4582         do {
4583             if (pg & 1) {
4584                 float32 mm = *(float32 *)(vm + H1_2(i));
4585                 result = float32_add(result, mm, status);
4586             }
4587             i += sizeof(float32), pg >>= sizeof(float32);
4588         } while (i & 15);
4589     } while (i < opr_sz);
4590 
4591     return result;
4592 }
4593 
4594 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4595                              float_status *status, uint32_t desc)
4596 {
4597     intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4598     uint64_t *m = vm;
4599     uint8_t *pg = vg;
4600 
4601     for (i = 0; i < opr_sz; i++) {
4602         if (pg[H1(i)] & 1) {
4603             nn = float64_add(nn, m[i], status);
4604         }
4605     }
4606 
4607     return nn;
4608 }
4609 
4610 /* Fully general three-operand expander, controlled by a predicate,
4611  * with the extra float_status parameter.
4612  */
4613 #define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
4614 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
4615                   float_status *status, uint32_t desc)          \
4616 {                                                               \
4617     intptr_t i = simd_oprsz(desc);                              \
4618     uint64_t *g = vg;                                           \
4619     do {                                                        \
4620         uint64_t pg = g[(i - 1) >> 6];                          \
4621         do {                                                    \
4622             i -= sizeof(TYPE);                                  \
4623             if (likely((pg >> (i & 63)) & 1)) {                 \
4624                 TYPE nn = *(TYPE *)(vn + H(i));                 \
4625                 TYPE mm = *(TYPE *)(vm + H(i));                 \
4626                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
4627             }                                                   \
4628         } while (i & 63);                                       \
4629     } while (i != 0);                                           \
4630 }
4631 
4632 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4633 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4634 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4635 
4636 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4637 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4638 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4639 
4640 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4641 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4642 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4643 
4644 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4645 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4646 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4647 
4648 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4649 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4650 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4651 
4652 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4653 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4654 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4655 
4656 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh)
4657 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins)
4658 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind)
4659 
4660 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh)
4661 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs)
4662 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd)
4663 
4664 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4665 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4666 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4667 
4668 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4669 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4670 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4671 
4672 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4673 {
4674     return float16_abs(float16_sub(a, b, s));
4675 }
4676 
4677 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4678 {
4679     return float32_abs(float32_sub(a, b, s));
4680 }
4681 
4682 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4683 {
4684     return float64_abs(float64_sub(a, b, s));
4685 }
4686 
4687 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
4688 static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat)
4689 {
4690     float16 r = float16_sub(op1, op2, stat);
4691     return float16_is_any_nan(r) ? r : float16_abs(r);
4692 }
4693 
4694 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat)
4695 {
4696     float32 r = float32_sub(op1, op2, stat);
4697     return float32_is_any_nan(r) ? r : float32_abs(r);
4698 }
4699 
4700 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat)
4701 {
4702     float64 r = float64_sub(op1, op2, stat);
4703     return float64_is_any_nan(r) ? r : float64_abs(r);
4704 }
4705 
4706 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4707 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4708 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4709 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h)
4710 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s)
4711 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d)
4712 
4713 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4714 {
4715     int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4716     return float64_scalbn(a, b_int, s);
4717 }
4718 
4719 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4720 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4721 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4722 
4723 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4724 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4725 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4726 
4727 #undef DO_ZPZZ_FP
4728 
4729 /* Three-operand expander, with one scalar operand, controlled by
4730  * a predicate, with the extra float_status parameter.
4731  */
4732 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4733 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
4734                   float_status *status, uint32_t desc)            \
4735 {                                                                 \
4736     intptr_t i = simd_oprsz(desc);                                \
4737     uint64_t *g = vg;                                             \
4738     TYPE mm = scalar;                                             \
4739     do {                                                          \
4740         uint64_t pg = g[(i - 1) >> 6];                            \
4741         do {                                                      \
4742             i -= sizeof(TYPE);                                    \
4743             if (likely((pg >> (i & 63)) & 1)) {                   \
4744                 TYPE nn = *(TYPE *)(vn + H(i));                   \
4745                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
4746             }                                                     \
4747         } while (i & 63);                                         \
4748     } while (i != 0);                                             \
4749 }
4750 
4751 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4752 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4753 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4754 
4755 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4756 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4757 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4758 
4759 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4760 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4761 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4762 
4763 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4764 {
4765     return float16_sub(b, a, s);
4766 }
4767 
4768 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4769 {
4770     return float32_sub(b, a, s);
4771 }
4772 
4773 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4774 {
4775     return float64_sub(b, a, s);
4776 }
4777 
4778 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4779 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4780 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4781 
4782 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4783 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4784 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4785 
4786 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4787 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4788 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4789 
4790 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4791 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4792 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4793 
4794 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4795 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4796 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4797 
4798 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh)
4799 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs)
4800 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd)
4801 
4802 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh)
4803 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins)
4804 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind)
4805 
4806 /* Fully general two-operand expander, controlled by a predicate,
4807  * with the extra float_status parameter.
4808  */
4809 #define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
4810 void HELPER(NAME)(void *vd, void *vn, void *vg,                       \
4811                   float_status *status, uint32_t desc)                \
4812 {                                                                     \
4813     intptr_t i = simd_oprsz(desc);                                    \
4814     uint64_t *g = vg;                                                 \
4815     do {                                                              \
4816         uint64_t pg = g[(i - 1) >> 6];                                \
4817         do {                                                          \
4818             i -= sizeof(TYPE);                                        \
4819             if (likely((pg >> (i & 63)) & 1)) {                       \
4820                 TYPE nn = *(TYPE *)(vn + H(i));                       \
4821                 *(TYPE *)(vd + H(i)) = OP(nn, status);                \
4822             }                                                         \
4823         } while (i & 63);                                             \
4824     } while (i != 0);                                                 \
4825 }
4826 
4827 /* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
4828  * FZ16.  When converting from fp16, this affects flushing input denormals;
4829  * when converting to fp16, this affects flushing output denormals.
4830  */
4831 float32 sve_f16_to_f32(float16 f, float_status *fpst)
4832 {
4833     bool save = get_flush_inputs_to_zero(fpst);
4834     float32 ret;
4835 
4836     set_flush_inputs_to_zero(false, fpst);
4837     ret = float16_to_float32(f, true, fpst);
4838     set_flush_inputs_to_zero(save, fpst);
4839     return ret;
4840 }
4841 
4842 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4843 {
4844     bool save = get_flush_inputs_to_zero(fpst);
4845     float64 ret;
4846 
4847     set_flush_inputs_to_zero(false, fpst);
4848     ret = float16_to_float64(f, true, fpst);
4849     set_flush_inputs_to_zero(save, fpst);
4850     return ret;
4851 }
4852 
4853 float16 sve_f32_to_f16(float32 f, float_status *fpst)
4854 {
4855     bool save = get_flush_to_zero(fpst);
4856     float16 ret;
4857 
4858     set_flush_to_zero(false, fpst);
4859     ret = float32_to_float16(f, true, fpst);
4860     set_flush_to_zero(save, fpst);
4861     return ret;
4862 }
4863 
4864 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4865 {
4866     bool save = get_flush_to_zero(fpst);
4867     float16 ret;
4868 
4869     set_flush_to_zero(false, fpst);
4870     ret = float64_to_float16(f, true, fpst);
4871     set_flush_to_zero(save, fpst);
4872     return ret;
4873 }
4874 
4875 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4876 {
4877     if (float16_is_any_nan(f)) {
4878         float_raise(float_flag_invalid, s);
4879         return 0;
4880     }
4881     return float16_to_int16_round_to_zero(f, s);
4882 }
4883 
4884 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4885 {
4886     if (float16_is_any_nan(f)) {
4887         float_raise(float_flag_invalid, s);
4888         return 0;
4889     }
4890     return float16_to_int64_round_to_zero(f, s);
4891 }
4892 
4893 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4894 {
4895     if (float32_is_any_nan(f)) {
4896         float_raise(float_flag_invalid, s);
4897         return 0;
4898     }
4899     return float32_to_int64_round_to_zero(f, s);
4900 }
4901 
4902 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4903 {
4904     if (float64_is_any_nan(f)) {
4905         float_raise(float_flag_invalid, s);
4906         return 0;
4907     }
4908     return float64_to_int64_round_to_zero(f, s);
4909 }
4910 
4911 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4912 {
4913     if (float16_is_any_nan(f)) {
4914         float_raise(float_flag_invalid, s);
4915         return 0;
4916     }
4917     return float16_to_uint16_round_to_zero(f, s);
4918 }
4919 
4920 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4921 {
4922     if (float16_is_any_nan(f)) {
4923         float_raise(float_flag_invalid, s);
4924         return 0;
4925     }
4926     return float16_to_uint64_round_to_zero(f, s);
4927 }
4928 
4929 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4930 {
4931     if (float32_is_any_nan(f)) {
4932         float_raise(float_flag_invalid, s);
4933         return 0;
4934     }
4935     return float32_to_uint64_round_to_zero(f, s);
4936 }
4937 
4938 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4939 {
4940     if (float64_is_any_nan(f)) {
4941         float_raise(float_flag_invalid, s);
4942         return 0;
4943     }
4944     return float64_to_uint64_round_to_zero(f, s);
4945 }
4946 
4947 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4948 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4949 DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
4950 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4951 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4952 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4953 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4954 
4955 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4956 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4957 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4958 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4959 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4960 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4961 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4962 
4963 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4964 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4965 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4966 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4967 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4968 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4969 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4970 
4971 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4972 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4973 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4974 
4975 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4976 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4977 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4978 
4979 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4980 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4981 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4982 
4983 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4984 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4985 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4986 
4987 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4988 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4989 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4990 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4991 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4992 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4993 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4994 
4995 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4996 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4997 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4998 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4999 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
5000 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
5001 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
5002 
5003 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
5004 {
5005     /* Extract frac to the top of the uint32_t. */
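    /* The shift of 16 + 6 skips the sign bit and 5 exponent bits of the fp16 value. */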
5006     uint32_t frac = (uint32_t)a << (16 + 6);
5007     int16_t exp = extract32(a, 10, 5);
5008 
5009     if (unlikely(exp == 0)) {
5010         if (frac != 0) {
5011             if (!get_flush_inputs_to_zero(s)) {
5012                 /* denormal: bias - fractional_zeros */
5013                 return -15 - clz32(frac);
5014             }
5015             /* flush to zero */
5016             float_raise(float_flag_input_denormal_flushed, s);
5017         }
5018     } else if (unlikely(exp == 0x1f)) {
5019         if (frac == 0) {
5020             return INT16_MAX; /* infinity */
5021         }
5022     } else {
5023         /* normal: exp - bias */
5024         return exp - 15;
5025     }
5026     /* nan or zero */
5027     float_raise(float_flag_invalid, s);
5028     return INT16_MIN;
5029 }
5030 
5031 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
5032 {
5033     /* Extract frac to the top of the uint32_t. */
5034     uint32_t frac = a << 9;
5035     int32_t exp = extract32(a, 23, 8);
5036 
5037     if (unlikely(exp == 0)) {
5038         if (frac != 0) {
5039             if (!get_flush_inputs_to_zero(s)) {
5040                 /* denormal: bias - fractional_zeros */
5041                 return -127 - clz32(frac);
5042             }
5043             /* flush to zero */
5044             float_raise(float_flag_input_denormal_flushed, s);
5045         }
5046     } else if (unlikely(exp == 0xff)) {
5047         if (frac == 0) {
5048             return INT32_MAX; /* infinity */
5049         }
5050     } else {
5051         /* normal: exp - bias */
5052         return exp - 127;
5053     }
5054     /* nan or zero */
5055     float_raise(float_flag_invalid, s);
5056     return INT32_MIN;
5057 }
5058 
5059 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
5060 {
5061     /* Extract frac to the top of the uint64_t. */
5062     uint64_t frac = a << 12;
5063     int64_t exp = extract64(a, 52, 11);
5064 
5065     if (unlikely(exp == 0)) {
5066         if (frac != 0) {
5067             if (!get_flush_inputs_to_zero(s)) {
5068                 /* denormal: bias - fractional_zeros */
5069                 return -1023 - clz64(frac);
5070             }
5071             /* flush to zero */
5072             float_raise(float_flag_input_denormal_flushed, s);
5073         }
5074     } else if (unlikely(exp == 0x7ff)) {
5075         if (frac == 0) {
5076             return INT64_MAX; /* infinity */
5077         }
5078     } else {
5079         /* normal: exp - bias */
5080         return exp - 1023;
5081     }
5082     /* nan or zero */
5083     float_raise(float_flag_invalid, s);
5084     return INT64_MIN;
5085 }
5086 
5087 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
5088 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
5089 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
5090 
5091 #undef DO_ZPZ_FP
5092 
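/*
 * Predicated fused multiply-add expanders.  neg1 and neg3 are XOR masks
 * for the sign bit of the first and third operands, used by the FMLS,
 * FNMLA and FNMLS forms when FPCR.AH = 0; the AH variants instead pass
 * float_muladd_negate_* flags, so that (as with the AH ABD helpers above)
 * the sign bit of a NaN is not flipped.
 */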
5093 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
5094                             float_status *status, uint32_t desc,
5095                             uint16_t neg1, uint16_t neg3, int flags)
5096 {
5097     intptr_t i = simd_oprsz(desc);
5098     uint64_t *g = vg;
5099 
5100     do {
5101         uint64_t pg = g[(i - 1) >> 6];
5102         do {
5103             i -= 2;
5104             if (likely((pg >> (i & 63)) & 1)) {
5105                 float16 e1, e2, e3, r;
5106 
5107                 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
5108                 e2 = *(uint16_t *)(vm + H1_2(i));
5109                 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
5110                 r = float16_muladd(e1, e2, e3, flags, status);
5111                 *(uint16_t *)(vd + H1_2(i)) = r;
5112             }
5113         } while (i & 63);
5114     } while (i != 0);
5115 }
5116 
5117 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5118                               void *vg, float_status *status, uint32_t desc)
5119 {
5120     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
5121 }
5122 
5123 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5124                               void *vg, float_status *status, uint32_t desc)
5125 {
5126     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0);
5127 }
5128 
5129 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5130                                void *vg, float_status *status, uint32_t desc)
5131 {
5132     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0);
5133 }
5134 
5135 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5136                                void *vg, float_status *status, uint32_t desc)
5137 {
5138     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0);
5139 }
5140 
5141 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5142                               void *vg, float_status *status, uint32_t desc)
5143 {
5144     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
5145                     float_muladd_negate_product);
5146 }
5147 
5148 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5149                                void *vg, float_status *status, uint32_t desc)
5150 {
5151     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
5152                     float_muladd_negate_product | float_muladd_negate_c);
5153 }
5154 
5155 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5156                                void *vg, float_status *status, uint32_t desc)
5157 {
5158     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
5159                     float_muladd_negate_c);
5160 }
5161 
5162 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
5163                             float_status *status, uint32_t desc,
5164                             uint32_t neg1, uint32_t neg3, int flags)
5165 {
5166     intptr_t i = simd_oprsz(desc);
5167     uint64_t *g = vg;
5168 
5169     do {
5170         uint64_t pg = g[(i - 1) >> 6];
5171         do {
5172             i -= 4;
5173             if (likely((pg >> (i & 63)) & 1)) {
5174                 float32 e1, e2, e3, r;
5175 
5176                 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
5177                 e2 = *(uint32_t *)(vm + H1_4(i));
5178                 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
5179                 r = float32_muladd(e1, e2, e3, flags, status);
5180                 *(uint32_t *)(vd + H1_4(i)) = r;
5181             }
5182         } while (i & 63);
5183     } while (i != 0);
5184 }
5185 
5186 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5187                               void *vg, float_status *status, uint32_t desc)
5188 {
5189     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
5190 }
5191 
5192 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5193                               void *vg, float_status *status, uint32_t desc)
5194 {
5195     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0);
5196 }
5197 
5198 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5199                                void *vg, float_status *status, uint32_t desc)
5200 {
5201     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0);
5202 }
5203 
5204 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5205                                void *vg, float_status *status, uint32_t desc)
5206 {
5207     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0);
5208 }
5209 
5210 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5211                               void *vg, float_status *status, uint32_t desc)
5212 {
5213     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
5214                     float_muladd_negate_product);
5215 }
5216 
5217 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5218                                void *vg, float_status *status, uint32_t desc)
5219 {
5220     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
5221                     float_muladd_negate_product | float_muladd_negate_c);
5222 }
5223 
5224 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5225                                void *vg, float_status *status, uint32_t desc)
5226 {
5227     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
5228                     float_muladd_negate_c);
5229 }
5230 
5231 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
5232                             float_status *status, uint32_t desc,
5233                             uint64_t neg1, uint64_t neg3, int flags)
5234 {
5235     intptr_t i = simd_oprsz(desc);
5236     uint64_t *g = vg;
5237 
5238     do {
5239         uint64_t pg = g[(i - 1) >> 6];
5240         do {
5241             i -= 8;
5242             if (likely((pg >> (i & 63)) & 1)) {
5243                 float64 e1, e2, e3, r;
5244 
5245                 e1 = *(uint64_t *)(vn + i) ^ neg1;
5246                 e2 = *(uint64_t *)(vm + i);
5247                 e3 = *(uint64_t *)(va + i) ^ neg3;
5248                 r = float64_muladd(e1, e2, e3, flags, status);
5249                 *(uint64_t *)(vd + i) = r;
5250             }
5251         } while (i & 63);
5252     } while (i != 0);
5253 }
5254 
5255 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5256                               void *vg, float_status *status, uint32_t desc)
5257 {
5258     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
5259 }
5260 
5261 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5262                               void *vg, float_status *status, uint32_t desc)
5263 {
5264     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0);
5265 }
5266 
5267 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5268                                void *vg, float_status *status, uint32_t desc)
5269 {
5270     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0);
5271 }
5272 
5273 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5274                                void *vg, float_status *status, uint32_t desc)
5275 {
5276     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0);
5277 }
5278 
5279 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5280                               void *vg, float_status *status, uint32_t desc)
5281 {
5282     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5283                     float_muladd_negate_product);
5284 }
5285 
5286 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5287                                void *vg, float_status *status, uint32_t desc)
5288 {
5289     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5290                     float_muladd_negate_product | float_muladd_negate_c);
5291 }
5292 
5293 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5294                                void *vg, float_status *status, uint32_t desc)
5295 {
5296     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5297                     float_muladd_negate_c);
5298 }
5299 
5300 /* Two operand floating-point comparison controlled by a predicate.
5301  * Unlike the integer version, we are not allowed to optimistically
5302  * compare operands, since the comparison may have side effects wrt
5303  * the FPSR.
5304  */
5305 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
5306 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
5307                   float_status *status, uint32_t desc)                  \
5308 {                                                                       \
5309     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
5310     uint64_t *d = vd, *g = vg;                                          \
5311     do {                                                                \
5312         uint64_t out = 0, pg = g[j];                                    \
5313         do {                                                            \
5314             i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
5315             if (likely((pg >> (i & 63)) & 1)) {                         \
5316                 TYPE nn = *(TYPE *)(vn + H(i));                         \
5317                 TYPE mm = *(TYPE *)(vm + H(i));                         \
5318                 out |= OP(TYPE, nn, mm, status);                        \
5319             }                                                           \
5320         } while (i & 63);                                               \
5321         d[j--] = out;                                                   \
5322     } while (i > 0);                                                    \
5323 }
5324 
5325 #define DO_FPCMP_PPZZ_H(NAME, OP) \
5326     DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
5327 #define DO_FPCMP_PPZZ_S(NAME, OP) \
5328     DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
5329 #define DO_FPCMP_PPZZ_D(NAME, OP) \
5330     DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
5331 
5332 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
5333     DO_FPCMP_PPZZ_H(NAME, OP)   \
5334     DO_FPCMP_PPZZ_S(NAME, OP)   \
5335     DO_FPCMP_PPZZ_D(NAME, OP)
5336 
5337 #define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
5338 #define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
5339 #define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
5340 #define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
5341 #define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
5342 #define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
5343 #define DO_FCMUO(TYPE, X, Y, ST)  \
5344     TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
5345 #define DO_FACGE(TYPE, X, Y, ST)  \
5346     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
5347 #define DO_FACGT(TYPE, X, Y, ST)  \
5348     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
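/*
 * GE/GT and ACGE/ACGT swap the operands so that X >= Y and X > Y can be
 * tested as compare(Y, X) <= 0 and < 0, while LE/LT compare the operands
 * directly; these use the signalling compare, which raises Invalid for
 * any NaN operand.  EQ, NE and UO use the quiet compare.
 */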
5349 
5350 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
5351 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
5352 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
5353 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
5354 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
5355 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
5356 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
5357 
5358 #undef DO_FPCMP_PPZZ_ALL
5359 #undef DO_FPCMP_PPZZ_D
5360 #undef DO_FPCMP_PPZZ_S
5361 #undef DO_FPCMP_PPZZ_H
5362 #undef DO_FPCMP_PPZZ
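
/*
 * Illustrative sketch (not part of the build): the DO_FPCMP_PPZZ loop
 * above visits elements from the highest byte offset downward and
 * shifts OUT left by sizeof(TYPE) bits each step, so the boolean result
 * for the element at byte offset K lands in bit (K & 63) of d[K >> 6],
 * matching the predicate layout of one bit per vector byte.  E.g. for
 * four float64 elements with results E0, E8, E16, E24 at offsets
 * 0, 8, 16, 24:
 *
 *     out = (((((E24 << 8) | E16) << 8) | E8) << 8) | E0;
 *
 * so bit 0 holds E0, bit 8 holds E8, and so on; inactive elements only
 * contribute zero bits.
 */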
5363 
5364 /* One operand floating-point comparison against zero, controlled
5365  * by a predicate.
5366  */
5367 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
5368 void HELPER(NAME)(void *vd, void *vn, void *vg,            \
5369                   float_status *status, uint32_t desc)     \
5370 {                                                          \
5371     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
5372     uint64_t *d = vd, *g = vg;                             \
5373     do {                                                   \
5374         uint64_t out = 0, pg = g[j];                       \
5375         do {                                               \
5376             i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
5377             if ((pg >> (i & 63)) & 1) {                    \
5378                 TYPE nn = *(TYPE *)(vn + H(i));            \
5379                 out |= OP(TYPE, nn, 0, status);            \
5380             }                                              \
5381         } while (i & 63);                                  \
5382         d[j--] = out;                                      \
5383     } while (i > 0);                                       \
5384 }
5385 
5386 #define DO_FPCMP_PPZ0_H(NAME, OP) \
5387     DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
5388 #define DO_FPCMP_PPZ0_S(NAME, OP) \
5389     DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
5390 #define DO_FPCMP_PPZ0_D(NAME, OP) \
5391     DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
5392 
5393 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
5394     DO_FPCMP_PPZ0_H(NAME, OP)   \
5395     DO_FPCMP_PPZ0_S(NAME, OP)   \
5396     DO_FPCMP_PPZ0_D(NAME, OP)
5397 
5398 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
5399 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
5400 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
5401 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
5402 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
5403 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
5404 
5405 /* FP Trig Multiply-Add. */
5406 
5407 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm,
5408                          float_status *s, uint32_t desc)
5409 {
5410     static const float16 coeff[16] = {
5411         0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5412         0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5413     };
5414     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
5415     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5416     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5417     float16 *d = vd, *n = vn, *m = vm;
5418 
5419     for (i = 0; i < opr_sz; i++) {
5420         float16 mm = m[i];
5421         intptr_t xx = x;
5422         int flags = 0;
5423 
5424         if (float16_is_neg(mm)) {
5425             if (fpcr_ah) {
5426                 flags = float_muladd_negate_product;
5427             } else {
5428                 mm = float16_abs(mm);
5429             }
5430             xx += 8;
5431         }
5432         d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s);
5433     }
5434 }
5435 
5436 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm,
5437                          float_status *s, uint32_t desc)
5438 {
5439     static const float32 coeff[16] = {
5440         0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5441         0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5442         0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5443         0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5444     };
5445     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5446     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5447     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5448     float32 *d = vd, *n = vn, *m = vm;
5449 
5450     for (i = 0; i < opr_sz; i++) {
5451         float32 mm = m[i];
5452         intptr_t xx = x;
5453         int flags = 0;
5454 
5455         if (float32_is_neg(mm)) {
5456             if (fpcr_ah) {
5457                 flags = float_muladd_negate_product;
5458             } else {
5459                 mm = float32_abs(mm);
5460             }
5461             xx += 8;
5462         }
5463         d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s);
5464     }
5465 }
5466 
5467 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm,
5468                          float_status *s, uint32_t desc)
5469 {
5470     static const float64 coeff[16] = {
5471         0x3ff0000000000000ull, 0xbfc5555555555543ull,
5472         0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5473         0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5474         0x3de5d8408868552full, 0x0000000000000000ull,
5475         0x3ff0000000000000ull, 0xbfe0000000000000ull,
5476         0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5477         0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5478         0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5479     };
5480     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5481     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5482     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5483     float64 *d = vd, *n = vn, *m = vm;
5484 
5485     for (i = 0; i < opr_sz; i++) {
5486         float64 mm = m[i];
5487         intptr_t xx = x;
5488         int flags = 0;
5489 
5490         if (float64_is_neg(mm)) {
5491             if (fpcr_ah) {
5492                 flags = float_muladd_negate_product;
5493             } else {
5494                 mm = float64_abs(mm);
5495             }
5496             xx += 8;
5497         }
5498         d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s);
5499     }
5500 }
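
/*
 * Illustrative note (an interpretation of the coefficient values above,
 * not something stated elsewhere in this file): each table appears to
 * hold two series of up to eight terms, entries 0-7 resembling the sine
 * series (1, -1/6, 1/120, ...) and entries 8-15 the cosine series
 * (1, -1/2, 1/24, ...).  Per element the helpers then compute
 *
 *     xx = x;                          the 3-bit immediate, 0..7
 *     if (m[i] is negative) {
 *         xx += 8;                     select the second series
 *         use |m[i]|, or with FPCR.AH set negate the product instead
 *     }
 *     d[i] = n[i] * m[i] + coeff[xx];
 *
 * i.e. one fused multiply-add step of the polynomial evaluation that
 * guest code builds up by iterating FTMAD.
 */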
5501 
5502 /*
5503  * FP Complex Add
5504  */
5505 
5506 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5507                          float_status *s, uint32_t desc)
5508 {
5509     intptr_t j, i = simd_oprsz(desc);
5510     uint64_t *g = vg;
5511     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5512     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5513 
5514     do {
5515         uint64_t pg = g[(i - 1) >> 6];
5516         do {
5517             float16 e0, e1, e2, e3;
5518 
5519             /* I holds the real index; J holds the imag index.  */
5520             j = i - sizeof(float16);
5521             i -= 2 * sizeof(float16);
5522 
5523             e0 = *(float16 *)(vn + H1_2(i));
5524             e1 = *(float16 *)(vm + H1_2(j));
5525             e2 = *(float16 *)(vn + H1_2(j));
5526             e3 = *(float16 *)(vm + H1_2(i));
5527 
5528             if (rot) {
5529                 e3 = float16_maybe_ah_chs(e3, fpcr_ah);
5530             } else {
5531                 e1 = float16_maybe_ah_chs(e1, fpcr_ah);
5532             }
5533 
5534             if (likely((pg >> (i & 63)) & 1)) {
5535                 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s);
5536             }
5537             if (likely((pg >> (j & 63)) & 1)) {
5538                 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s);
5539             }
5540         } while (i & 63);
5541     } while (i != 0);
5542 }
5543 
5544 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5545                          float_status *s, uint32_t desc)
5546 {
5547     intptr_t j, i = simd_oprsz(desc);
5548     uint64_t *g = vg;
5549     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5550     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5551 
5552     do {
5553         uint64_t pg = g[(i - 1) >> 6];
5554         do {
5555             float32 e0, e1, e2, e3;
5556 
5557             /* I holds the real index; J holds the imag index.  */
5558             j = i - sizeof(float32);
5559             i -= 2 * sizeof(float32);
5560 
5561             e0 = *(float32 *)(vn + H1_4(i));
5562             e1 = *(float32 *)(vm + H1_4(j));
5563             e2 = *(float32 *)(vn + H1_4(j));
5564             e3 = *(float32 *)(vm + H1_4(i));
5565 
5566             if (rot) {
5567                 e3 = float32_maybe_ah_chs(e3, fpcr_ah);
5568             } else {
5569                 e1 = float32_maybe_ah_chs(e1, fpcr_ah);
5570             }
5571 
5572             if (likely((pg >> (i & 63)) & 1)) {
5573                 *(float32 *)(vd + H1_4(i)) = float32_add(e0, e1, s);
5574             }
5575             if (likely((pg >> (j & 63)) & 1)) {
5576                 *(float32 *)(vd + H1_4(j)) = float32_add(e2, e3, s);
5577             }
5578         } while (i & 63);
5579     } while (i != 0);
5580 }
5581 
5582 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5583                          float_status *s, uint32_t desc)
5584 {
5585     intptr_t j, i = simd_oprsz(desc);
5586     uint64_t *g = vg;
5587     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5588     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5589 
5590     do {
5591         uint64_t pg = g[(i - 1) >> 6];
5592         do {
5593             float64 e0, e1, e2, e3;
5594 
5595             /* I holds the real index; J holds the imag index.  */
5596             j = i - sizeof(float64);
5597             i -= 2 * sizeof(float64);
5598 
5599             e0 = *(float64 *)(vn + H1_8(i));
5600             e1 = *(float64 *)(vm + H1_8(j));
5601             e2 = *(float64 *)(vn + H1_8(j));
5602             e3 = *(float64 *)(vm + H1_8(i));
5603 
5604             if (rot) {
5605                 e3 = float64_maybe_ah_chs(e3, fpcr_ah);
5606             } else {
5607                 e1 = float64_maybe_ah_chs(e1, fpcr_ah);
5608             }
5609 
5610             if (likely((pg >> (i & 63)) & 1)) {
5611                 *(float64 *)(vd + H1_8(i)) = float64_add(e0, e1, s);
5612             }
5613             if (likely((pg >> (j & 63)) & 1)) {
5614                 *(float64 *)(vd + H1_8(j)) = float64_add(e2, e3, s);
5615             }
5616         } while (i & 63);
5617     } while (i != 0);
5618 }
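
/*
 * Illustrative sketch (not part of the build): for one complex pair,
 * with R = n[real], I = n[imag], r = m[real], i = m[imag], the helpers
 * above compute
 *
 *     rot clear:  d[real] = R + (-i);    d[imag] = I + r;
 *     rot set:    d[real] = R + i;       d[imag] = I + (-r);
 *
 * i.e. the addend is the other operand rotated by 90 or 270 degrees in
 * the complex plane; which ROT immediate maps to which case is decided
 * by the translator, not here.  With FPCR.AH set the negation goes
 * through float*_maybe_ah_chs, which follows the AH sign rules instead
 * of unconditionally flipping the sign bit.
 */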
5619 
5620 /*
5621  * FP Complex Multiply
5622  */
5623 
5624 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5625                                void *vg, float_status *status, uint32_t desc)
5626 {
5627     intptr_t j, i = simd_oprsz(desc);
5628     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5629     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5630     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5631     uint32_t negf_real = flip ^ negf_imag;
5632     float16 negx_imag, negx_real;
5633     uint64_t *g = vg;
5634 
5635     /* With AH=0, use negx; with AH=1 use negf. */
5636     negx_real = (negf_real & ~fpcr_ah) << 15;
5637     negx_imag = (negf_imag & ~fpcr_ah) << 15;
5638     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5639     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5640 
5641     do {
5642         uint64_t pg = g[(i - 1) >> 6];
5643         do {
5644             float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5645 
5646             /* I holds the real index; J holds the imag index.  */
5647             j = i - sizeof(float16);
5648             i -= 2 * sizeof(float16);
5649 
5650             nr = *(float16 *)(vn + H1_2(i));
5651             ni = *(float16 *)(vn + H1_2(j));
5652             mr = *(float16 *)(vm + H1_2(i));
5653             mi = *(float16 *)(vm + H1_2(j));
5654 
5655             e2 = (flip ? ni : nr);
5656             e1 = (flip ? mi : mr) ^ negx_real;
5657             e4 = e2;
5658             e3 = (flip ? mr : mi) ^ negx_imag;
5659 
5660             if (likely((pg >> (i & 63)) & 1)) {
5661                 d = *(float16 *)(va + H1_2(i));
5662                 d = float16_muladd(e2, e1, d, negf_real, status);
5663                 *(float16 *)(vd + H1_2(i)) = d;
5664             }
5665             if (likely((pg >> (j & 63)) & 1)) {
5666                 d = *(float16 *)(va + H1_2(j));
5667                 d = float16_muladd(e4, e3, d, negf_imag, status);
5668                 *(float16 *)(vd + H1_2(j)) = d;
5669             }
5670         } while (i & 63);
5671     } while (i != 0);
5672 }
5673 
5674 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5675                                void *vg, float_status *status, uint32_t desc)
5676 {
5677     intptr_t j, i = simd_oprsz(desc);
5678     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5679     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5680     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5681     uint32_t negf_real = flip ^ negf_imag;
5682     float32 negx_imag, negx_real;
5683     uint64_t *g = vg;
5684 
5685     /* With AH=0, use negx; with AH=1 use negf. */
5686     negx_real = (negf_real & ~fpcr_ah) << 31;
5687     negx_imag = (negf_imag & ~fpcr_ah) << 31;
5688     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5689     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5690 
5691     do {
5692         uint64_t pg = g[(i - 1) >> 6];
5693         do {
5694             float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5695 
5696             /* I holds the real index; J holds the imag index.  */
5697             j = i - sizeof(float32);
5698             i -= 2 * sizeof(float32);
5699 
5700             nr = *(float32 *)(vn + H1_4(i));
5701             ni = *(float32 *)(vn + H1_4(j));
5702             mr = *(float32 *)(vm + H1_4(i));
5703             mi = *(float32 *)(vm + H1_4(j));
5704 
5705             e2 = (flip ? ni : nr);
5706             e1 = (flip ? mi : mr) ^ negx_real;
5707             e4 = e2;
5708             e3 = (flip ? mr : mi) ^ negx_imag;
5709 
5710             if (likely((pg >> (i & 63)) & 1)) {
5711                 d = *(float32 *)(va + H1_4(i));
5712                 d = float32_muladd(e2, e1, d, negf_real, status);
5713                 *(float32 *)(vd + H1_4(i)) = d;
5714             }
5715             if (likely((pg >> (j & 63)) & 1)) {
5716                 d = *(float32 *)(va + H1_4(j));
5717                 d = float32_muladd(e4, e3, d, negf_imag, status);
5718                 *(float32 *)(vd + H1_4(j)) = d;
5719             }
5720         } while (i & 63);
5721     } while (i != 0);
5722 }
5723 
5724 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5725                                void *vg, float_status *status, uint32_t desc)
5726 {
5727     intptr_t j, i = simd_oprsz(desc);
5728     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5729     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5730     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5731     uint32_t negf_real = flip ^ negf_imag;
5732     float64 negx_imag, negx_real;
5733     uint64_t *g = vg;
5734 
5735     /* With AH=0, use negx; with AH=1 use negf. */
5736     negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
5737     negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
5738     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5739     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5740 
5741     do {
5742         uint64_t pg = g[(i - 1) >> 6];
5743         do {
5744             float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5745 
5746             /* I holds the real index; J holds the imag index.  */
5747             j = i - sizeof(float64);
5748             i -= 2 * sizeof(float64);
5749 
5750             nr = *(float64 *)(vn + H1_8(i));
5751             ni = *(float64 *)(vn + H1_8(j));
5752             mr = *(float64 *)(vm + H1_8(i));
5753             mi = *(float64 *)(vm + H1_8(j));
5754 
5755             e2 = (flip ? ni : nr);
5756             e1 = (flip ? mi : mr) ^ negx_real;
5757             e4 = e2;
5758             e3 = (flip ? mr : mi) ^ negx_imag;
5759 
5760             if (likely((pg >> (i & 63)) & 1)) {
5761                 d = *(float64 *)(va + H1_8(i));
5762                 d = float64_muladd(e2, e1, d, negf_real, status);
5763                 *(float64 *)(vd + H1_8(i)) = d;
5764             }
5765             if (likely((pg >> (j & 63)) & 1)) {
5766                 d = *(float64 *)(va + H1_8(j));
5767                 d = float64_muladd(e4, e3, d, negf_imag, status);
5768                 *(float64 *)(vd + H1_8(j)) = d;
5769             }
5770         } while (i & 63);
5771     } while (i != 0);
5772 }
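
/*
 * Illustrative sketch (not part of the build): ignoring the negation
 * controls, one step of the FCMLA helpers above accumulates
 *
 *     flip clear:  d[real] += n[real] * m[real];
 *                  d[imag] += n[real] * m[imag];
 *     flip set:    d[real] += n[imag] * m[imag];
 *                  d[imag] += n[imag] * m[real];
 *
 * The four FCMLA rotations are then produced by the translator choosing
 * FLIP together with which of the two products is negated, either by
 * flipping the sign bit of the multiplicand up front (negx_*, FPCR.AH
 * clear) or by passing float_muladd_negate_product to the fused
 * multiply-add (negf_*, FPCR.AH set).
 */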
5773 
5774 /*
5775  * Load contiguous data, protected by a governing predicate.
5776  */
5777 
5778 /*
5779  * Skip through a sequence of inactive elements in the guarding predicate @vg,
5780  * beginning at @reg_off bounded by @reg_max.  Return the offset of the active
5781  * element >= @reg_off, or @reg_max if there were no active elements at all.
5782  */
5783 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5784                                  intptr_t reg_max, int esz)
5785 {
5786     uint64_t pg_mask = pred_esz_masks[esz];
5787     uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5788 
5789     /* In normal usage, the first element is active.  */
5790     if (likely(pg & 1)) {
5791         return reg_off;
5792     }
5793 
5794     if (pg == 0) {
5795         reg_off &= -64;
5796         do {
5797             reg_off += 64;
5798             if (unlikely(reg_off >= reg_max)) {
5799                 /* The entire predicate was false.  */
5800                 return reg_max;
5801             }
5802             pg = vg[reg_off >> 6] & pg_mask;
5803         } while (pg == 0);
5804     }
5805     reg_off += ctz64(pg);
5806 
5807     /* We should never see an out of range predicate bit set.  */
5808     tcg_debug_assert(reg_off < reg_max);
5809     return reg_off;
5810 }
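
/*
 * Usage sketch (illustrative only; process_element() is a hypothetical
 * stand-in for whatever per-element work a caller performs):
 *
 *     intptr_t off = 0;
 *     while ((off = find_next_active(vg, off, reg_max, esz)) < reg_max) {
 *         process_element(off);
 *         off += 1 << esz;
 *         if (off >= reg_max) {
 *             break;
 *         }
 *     }
 *
 * The contiguous load/store code below mostly tests predicate bits
 * directly within 64-bit words and only uses this helper to locate the
 * first active element on the second page.
 */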
5811 
5812 /*
5813  * Resolve the guest virtual address to info->host and info->flags.
5814  * If @nofault, return false if the page is invalid, otherwise
5815  * exit via page fault exception.
5816  */
5817 
5818 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5819                     target_ulong addr, int mem_off, MMUAccessType access_type,
5820                     int mmu_idx, uintptr_t retaddr)
5821 {
5822     int flags;
5823 
5824     addr += mem_off;
5825 
5826     /*
5827      * User-only currently always issues with TBI.  See the comment
5828      * above useronly_clean_ptr.  Usually we clean this top byte away
5829      * during translation, but we can't do that for e.g. vector + imm
5830      * addressing modes.
5831      *
5832      * We currently always enable TBI for user-only, and do not provide
5833      * a way to turn it off.  So clean the pointer unconditionally here,
5834      * rather than look it up here, or pass it down from above.
5835      */
5836     addr = useronly_clean_ptr(addr);
5837 
5838 #ifdef CONFIG_USER_ONLY
5839     flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5840                                &info->host, retaddr);
5841 #else
5842     CPUTLBEntryFull *full;
5843     flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5844                               &info->host, &full, retaddr);
5845 #endif
5846     info->flags = flags;
5847 
5848     if (flags & TLB_INVALID_MASK) {
5849         g_assert(nofault);
5850         return false;
5851     }
5852 
5853 #ifdef CONFIG_USER_ONLY
5854     memset(&info->attrs, 0, sizeof(info->attrs));
5855     /* Require both ANON and MTE; see allocation_tag_mem(). */
5856     info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5857 #else
5858     info->attrs = full->attrs;
5859     info->tagged = full->extra.arm.pte_attrs == 0xf0;
5860 #endif
5861 
5862     /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5863     info->host -= mem_off;
5864     return true;
5865 }
5866 
5867 /*
5868  * Find first active element on each page, and a loose bound for the
5869  * final element on each page.  Identify any single element that spans
5870  * the page boundary.  Return true if there are any active elements.
5871  */
5872 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5873                             intptr_t reg_max, int esz, int msize)
5874 {
5875     const int esize = 1 << esz;
5876     const uint64_t pg_mask = pred_esz_masks[esz];
5877     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5878     intptr_t mem_off_last, mem_off_split;
5879     intptr_t page_split, elt_split;
5880     intptr_t i;
5881 
5882     /* Set all of the element indices to -1, and the TLB data to 0. */
5883     memset(info, -1, offsetof(SVEContLdSt, page));
5884     memset(info->page, 0, sizeof(info->page));
5885 
5886     /* Gross scan over the entire predicate to find bounds. */
5887     i = 0;
5888     do {
5889         uint64_t pg = vg[i] & pg_mask;
5890         if (pg) {
5891             reg_off_last = i * 64 + 63 - clz64(pg);
5892             if (reg_off_first < 0) {
5893                 reg_off_first = i * 64 + ctz64(pg);
5894             }
5895         }
5896     } while (++i * 64 < reg_max);
5897 
5898     if (unlikely(reg_off_first < 0)) {
5899         /* No active elements, no pages touched. */
5900         return false;
5901     }
5902     tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5903 
5904     info->reg_off_first[0] = reg_off_first;
5905     info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5906     mem_off_last = (reg_off_last >> esz) * msize;
5907 
5908     page_split = -(addr | TARGET_PAGE_MASK);
5909     if (likely(mem_off_last + msize <= page_split)) {
5910         /* The entire operation fits within a single page. */
5911         info->reg_off_last[0] = reg_off_last;
5912         return true;
5913     }
5914 
5915     info->page_split = page_split;
5916     elt_split = page_split / msize;
5917     reg_off_split = elt_split << esz;
5918     mem_off_split = elt_split * msize;
5919 
5920     /*
5921      * This is the last full element on the first page, but it is not
5922      * necessarily active.  If there is no full element, i.e. the first
5923      * active element is the one that's split, this value remains -1.
5924      * It is useful as iteration bounds.
5925      */
5926     if (elt_split != 0) {
5927         info->reg_off_last[0] = reg_off_split - esize;
5928     }
5929 
5930     /* Determine if an unaligned element spans the pages.  */
5931     if (page_split % msize != 0) {
5932         /* It is helpful to know if the split element is active. */
5933         if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5934             info->reg_off_split = reg_off_split;
5935             info->mem_off_split = mem_off_split;
5936 
5937             if (reg_off_split == reg_off_last) {
5938                 /* The page crossing element is last. */
5939                 return true;
5940             }
5941         }
5942         reg_off_split += esize;
5943         mem_off_split += msize;
5944     }
5945 
5946     /*
5947      * We do want the first active element on the second page, because
5948      * this may affect the address reported in an exception.
5949      */
5950     reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5951     tcg_debug_assert(reg_off_split <= reg_off_last);
5952     info->reg_off_first[1] = reg_off_split;
5953     info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5954     info->reg_off_last[1] = reg_off_last;
5955     return true;
5956 }
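
/*
 * Worked example (illustrative only): esz = 3 (esize = msize = 8),
 * reg_max = 32, all four elements active, and addr 12 bytes before the
 * end of a page.  Then:
 *
 *     page_split       = 12    bytes remaining on the first page
 *     elt_split        = 1     whole elements on the first page
 *     reg_off_last[0]  = 0     last whole element on the first page
 *     reg_off_split    = 8     element 1 straddles the page boundary
 *     mem_off_split    = 8
 *     reg_off_first[1] = 16    first element wholly on the second page
 *     reg_off_last[1]  = 24
 *
 * Had addr been 16 bytes before the page end, page_split % msize would
 * be zero, no element would straddle the boundary, and reg_off_split
 * would remain -1.
 */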
5957 
5958 /*
5959  * Resolve the guest virtual addresses to info->page[].
5960  * Control the generation of page faults with @fault.  Return false if
5961  * there is no work to do, which can only happen with @fault == FAULT_NO.
5962  */
5963 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5964                          CPUARMState *env, target_ulong addr,
5965                          MMUAccessType access_type, uintptr_t retaddr)
5966 {
5967     int mmu_idx = arm_env_mmu_index(env);
5968     int mem_off = info->mem_off_first[0];
5969     bool nofault = fault == FAULT_NO;
5970     bool have_work = true;
5971 
5972     if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5973                         access_type, mmu_idx, retaddr)) {
5974         /* No work to be done. */
5975         return false;
5976     }
5977 
5978     if (likely(info->page_split < 0)) {
5979         /* The entire operation was on the one page. */
5980         return true;
5981     }
5982 
5983     /*
5984      * If the second page is invalid, then we want the fault address to be
5985      * the first byte on that page which is accessed.
5986      */
5987     if (info->mem_off_split >= 0) {
5988         /*
5989          * There is an element split across the pages.  The fault address
5990          * should be the first byte of the second page.
5991          */
5992         mem_off = info->page_split;
5993         /*
5994          * If the split element is also the first active element of the
5995          * vector, then:  For first-fault we should continue to generate
5996          * faults for the second page.  For no-fault, we have work only if
5997          * the second page is valid.  Otherwise we are past first-fault territory.
5998          */
5999         have_work = info->mem_off_first[0] < info->mem_off_split;
6000         if (have_work) {
6001             nofault = fault != FAULT_ALL;
6002         }
6003     } else {
6004         /*
6005          * There is no element split across the pages.  The fault address
6006          * should be the first active element on the second page.
6007          */
6008         mem_off = info->mem_off_first[1];
6009         /*
6010          * There must have been one active element on the first page,
6011          * so we're out of first-fault territory.
6012          */
6013         nofault = fault != FAULT_ALL;
6014     }
6015 
6016     have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
6017                                 access_type, mmu_idx, retaddr);
6018     return have_work;
6019 }
6020 
6021 #ifndef CONFIG_USER_ONLY
6022 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
6023                                uint64_t *vg, target_ulong addr,
6024                                int esize, int msize, int wp_access,
6025                                uintptr_t retaddr)
6026 {
6027     intptr_t mem_off, reg_off, reg_last;
6028     int flags0 = info->page[0].flags;
6029     int flags1 = info->page[1].flags;
6030 
6031     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
6032         return;
6033     }
6034 
6035     /* Indicate that watchpoints are handled. */
6036     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
6037     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
6038 
6039     if (flags0 & TLB_WATCHPOINT) {
6040         mem_off = info->mem_off_first[0];
6041         reg_off = info->reg_off_first[0];
6042         reg_last = info->reg_off_last[0];
6043 
6044         while (reg_off <= reg_last) {
6045             uint64_t pg = vg[reg_off >> 6];
6046             do {
6047                 if ((pg >> (reg_off & 63)) & 1) {
6048                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
6049                                          msize, info->page[0].attrs,
6050                                          wp_access, retaddr);
6051                 }
6052                 reg_off += esize;
6053                 mem_off += msize;
6054             } while (reg_off <= reg_last && (reg_off & 63));
6055         }
6056     }
6057 
6058     mem_off = info->mem_off_split;
6059     if (mem_off >= 0) {
6060         cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
6061                              info->page[0].attrs, wp_access, retaddr);
6062     }
6063 
6064     mem_off = info->mem_off_first[1];
6065     if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
6066         reg_off = info->reg_off_first[1];
6067         reg_last = info->reg_off_last[1];
6068 
6069         do {
6070             uint64_t pg = vg[reg_off >> 6];
6071             do {
6072                 if ((pg >> (reg_off & 63)) & 1) {
6073                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
6074                                          msize, info->page[1].attrs,
6075                                          wp_access, retaddr);
6076                 }
6077                 reg_off += esize;
6078                 mem_off += msize;
6079             } while (reg_off & 63);
6080         } while (reg_off <= reg_last);
6081     }
6082 }
6083 #endif
6084 
6085 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
6086                              uint64_t *vg, target_ulong addr, int esize,
6087                              int msize, uint32_t mtedesc, uintptr_t ra)
6088 {
6089     intptr_t mem_off, reg_off, reg_last;
6090 
6091     /* Process the page only if MemAttr == Tagged. */
6092     if (info->page[0].tagged) {
6093         mem_off = info->mem_off_first[0];
6094         reg_off = info->reg_off_first[0];
6095         reg_last = info->reg_off_split;
6096         if (reg_last < 0) {
6097             reg_last = info->reg_off_last[0];
6098         }
6099 
6100         do {
6101             uint64_t pg = vg[reg_off >> 6];
6102             do {
6103                 if ((pg >> (reg_off & 63)) & 1) {
6104                     mte_check(env, mtedesc, addr + mem_off, ra);
6105                 }
6106                 reg_off += esize;
6107                 mem_off += msize;
6108             } while (reg_off <= reg_last && (reg_off & 63));
6109         } while (reg_off <= reg_last);
6110     }
6111 
6112     mem_off = info->mem_off_first[1];
6113     if (mem_off >= 0 && info->page[1].tagged) {
6114         reg_off = info->reg_off_first[1];
6115         reg_last = info->reg_off_last[1];
6116 
6117         do {
6118             uint64_t pg = vg[reg_off >> 6];
6119             do {
6120                 if ((pg >> (reg_off & 63)) & 1) {
6121                     mte_check(env, mtedesc, addr + mem_off, ra);
6122                 }
6123                 reg_off += esize;
6124                 mem_off += msize;
6125             } while (reg_off & 63);
6126         } while (reg_off <= reg_last);
6127     }
6128 }
6129 
6130 /*
6131  * Common helper for all contiguous 1,2,3,4-register predicated loads.
6132  */
6133 static inline QEMU_ALWAYS_INLINE
6134 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
6135                uint32_t desc, const uintptr_t retaddr,
6136                const int esz, const int msz, const int N, uint32_t mtedesc,
6137                sve_ldst1_host_fn *host_fn,
6138                sve_ldst1_tlb_fn *tlb_fn)
6139 {
6140     const unsigned rd = simd_data(desc);
6141     const intptr_t reg_max = simd_oprsz(desc);
6142     intptr_t reg_off, reg_last, mem_off;
6143     SVEContLdSt info;
6144     void *host;
6145     int flags, i;
6146 
6147     /* Find the active elements.  */
6148     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6149         /* The entire predicate was false; no load occurs.  */
6150         for (i = 0; i < N; ++i) {
6151             memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
6152         }
6153         return;
6154     }
6155 
6156     /* Probe the page(s).  Exit with exception for any invalid page. */
6157     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
6158 
6159     /* Handle watchpoints for all active elements. */
6160     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6161                               BP_MEM_READ, retaddr);
6162 
6163     /*
6164      * Handle mte checks for all active elements.
6165      * Since TBI must be set for MTE, !mtedesc => !mte_active.
6166      */
6167     if (mtedesc) {
6168         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6169                                 mtedesc, retaddr);
6170     }
6171 
6172     flags = info.page[0].flags | info.page[1].flags;
6173     if (unlikely(flags != 0)) {
6174         /*
6175          * At least one page includes MMIO.
6176          * Any bus operation can fail with cpu_transaction_failed,
6177          * which for ARM will raise SyncExternal.  Perform the load
6178          * into scratch memory to preserve register state until the end.
6179          */
6180         ARMVectorReg scratch[4] = { };
6181 
6182         mem_off = info.mem_off_first[0];
6183         reg_off = info.reg_off_first[0];
6184         reg_last = info.reg_off_last[1];
6185         if (reg_last < 0) {
6186             reg_last = info.reg_off_split;
6187             if (reg_last < 0) {
6188                 reg_last = info.reg_off_last[0];
6189             }
6190         }
6191 
6192         do {
6193             uint64_t pg = vg[reg_off >> 6];
6194             do {
6195                 if ((pg >> (reg_off & 63)) & 1) {
6196                     for (i = 0; i < N; ++i) {
6197                         tlb_fn(env, &scratch[i], reg_off,
6198                                addr + mem_off + (i << msz), retaddr);
6199                     }
6200                 }
6201                 reg_off += 1 << esz;
6202                 mem_off += N << msz;
6203             } while (reg_off & 63);
6204         } while (reg_off <= reg_last);
6205 
6206         for (i = 0; i < N; ++i) {
6207             memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
6208         }
6209         return;
6210     }
6211 
6212     /* The entire operation is in RAM, on valid pages. */
6213 
6214     for (i = 0; i < N; ++i) {
6215         memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
6216     }
6217 
6218     mem_off = info.mem_off_first[0];
6219     reg_off = info.reg_off_first[0];
6220     reg_last = info.reg_off_last[0];
6221     host = info.page[0].host;
6222 
6223     set_helper_retaddr(retaddr);
6224 
6225     while (reg_off <= reg_last) {
6226         uint64_t pg = vg[reg_off >> 6];
6227         do {
6228             if ((pg >> (reg_off & 63)) & 1) {
6229                 for (i = 0; i < N; ++i) {
6230                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6231                             host + mem_off + (i << msz));
6232                 }
6233             }
6234             reg_off += 1 << esz;
6235             mem_off += N << msz;
6236         } while (reg_off <= reg_last && (reg_off & 63));
6237     }
6238 
6239     clear_helper_retaddr();
6240 
6241     /*
6242      * Use the slow path to manage the cross-page misalignment.
6243      * But we know this is RAM and cannot trap.
6244      */
6245     mem_off = info.mem_off_split;
6246     if (unlikely(mem_off >= 0)) {
6247         reg_off = info.reg_off_split;
6248         for (i = 0; i < N; ++i) {
6249             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6250                    addr + mem_off + (i << msz), retaddr);
6251         }
6252     }
6253 
6254     mem_off = info.mem_off_first[1];
6255     if (unlikely(mem_off >= 0)) {
6256         reg_off = info.reg_off_first[1];
6257         reg_last = info.reg_off_last[1];
6258         host = info.page[1].host;
6259 
6260         set_helper_retaddr(retaddr);
6261 
6262         do {
6263             uint64_t pg = vg[reg_off >> 6];
6264             do {
6265                 if ((pg >> (reg_off & 63)) & 1) {
6266                     for (i = 0; i < N; ++i) {
6267                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6268                                 host + mem_off + (i << msz));
6269                     }
6270                 }
6271                 reg_off += 1 << esz;
6272                 mem_off += N << msz;
6273             } while (reg_off & 63);
6274         } while (reg_off <= reg_last);
6275 
6276         clear_helper_retaddr();
6277     }
6278 }
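
/*
 * Illustrative note: the multi-register forms de-interleave structures
 * as they walk.  E.g. for LD2H (esz = msz = MO_16, N = 2), each active
 * element at vector byte offset reg_off loads two consecutive halfwords,
 * roughly
 *
 *     zregs[rd]     element reg_off/2  <-  mem[addr + mem_off]
 *     zregs[rd + 1] element reg_off/2  <-  mem[addr + mem_off + 2]
 *
 * after which mem_off advances by N << msz = 4 while reg_off advances by
 * 1 << esz = 2, so consecutive structure members land in consecutive
 * registers at the same element index.
 */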
6279 
6280 static inline QEMU_ALWAYS_INLINE
6281 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6282                    uint32_t desc, const uintptr_t ra,
6283                    const int esz, const int msz, const int N,
6284                    sve_ldst1_host_fn *host_fn,
6285                    sve_ldst1_tlb_fn *tlb_fn)
6286 {
6287     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6288     int bit55 = extract64(addr, 55, 1);
6289 
6290     /* Remove mtedesc from the normal sve descriptor. */
6291     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6292 
6293     /* Perform gross MTE suppression early. */
6294     if (!tbi_check(mtedesc, bit55) ||
6295         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6296         mtedesc = 0;
6297     }
6298 
6299     sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6300 }
6301 
6302 #define DO_LD1_1(NAME, ESZ)                                             \
6303 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
6304                             target_ulong addr, uint32_t desc)           \
6305 {                                                                       \
6306     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
6307               sve_##NAME##_host, sve_##NAME##_tlb);                     \
6308 }                                                                       \
6309 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
6310                                 target_ulong addr, uint32_t desc)       \
6311 {                                                                       \
6312     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
6313                   sve_##NAME##_host, sve_##NAME##_tlb);                 \
6314 }
6315 
6316 #define DO_LD1_2(NAME, ESZ, MSZ)                                        \
6317 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
6318                                target_ulong addr, uint32_t desc)        \
6319 {                                                                       \
6320     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
6321               sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
6322 }                                                                       \
6323 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
6324                                target_ulong addr, uint32_t desc)        \
6325 {                                                                       \
6326     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
6327               sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
6328 }                                                                       \
6329 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
6330                                    target_ulong addr, uint32_t desc)    \
6331 {                                                                       \
6332     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
6333                   sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
6334 }                                                                       \
6335 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
6336                                    target_ulong addr, uint32_t desc)    \
6337 {                                                                       \
6338     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
6339                   sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
6340 }
6341 
6342 DO_LD1_1(ld1bb,  MO_8)
6343 DO_LD1_1(ld1bhu, MO_16)
6344 DO_LD1_1(ld1bhs, MO_16)
6345 DO_LD1_1(ld1bsu, MO_32)
6346 DO_LD1_1(ld1bss, MO_32)
6347 DO_LD1_1(ld1bdu, MO_64)
6348 DO_LD1_1(ld1bds, MO_64)
6349 
6350 DO_LD1_2(ld1hh,  MO_16, MO_16)
6351 DO_LD1_2(ld1hsu, MO_32, MO_16)
6352 DO_LD1_2(ld1hss, MO_32, MO_16)
6353 DO_LD1_2(ld1hdu, MO_64, MO_16)
6354 DO_LD1_2(ld1hds, MO_64, MO_16)
6355 
6356 DO_LD1_2(ld1ss,  MO_32, MO_32)
6357 DO_LD1_2(ld1sdu, MO_64, MO_32)
6358 DO_LD1_2(ld1sds, MO_64, MO_32)
6359 
6360 DO_LD1_2(ld1dd,  MO_64, MO_64)
6361 
6362 DO_LD1_2(ld1squ, MO_32, MO_128)
6363 DO_LD1_2(ld1dqu, MO_64, MO_128)
6364 
6365 #undef DO_LD1_1
6366 #undef DO_LD1_2
6367 
6368 #define DO_LDN_1(N)                                                     \
6369 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
6370                              target_ulong addr, uint32_t desc)          \
6371 {                                                                       \
6372     sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
6373               sve_ld1bb_host, sve_ld1bb_tlb);                           \
6374 }                                                                       \
6375 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
6376                                  target_ulong addr, uint32_t desc)      \
6377 {                                                                       \
6378     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
6379                   sve_ld1bb_host, sve_ld1bb_tlb);                       \
6380 }
6381 
6382 #define DO_LDN_2(N, SUFF, ESZ)                                          \
6383 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
6384                                     target_ulong addr, uint32_t desc)   \
6385 {                                                                       \
6386     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
6387               sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
6388 }                                                                       \
6389 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
6390                                     target_ulong addr, uint32_t desc)   \
6391 {                                                                       \
6392     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
6393               sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
6394 }                                                                       \
6395 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
6396                                         target_ulong addr, uint32_t desc) \
6397 {                                                                       \
6398     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
6399                   sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
6400 }                                                                       \
6401 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
6402                                         target_ulong addr, uint32_t desc) \
6403 {                                                                       \
6404     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
6405                   sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
6406 }
6407 
6408 DO_LDN_1(2)
6409 DO_LDN_1(3)
6410 DO_LDN_1(4)
6411 
6412 DO_LDN_2(2, hh, MO_16)
6413 DO_LDN_2(3, hh, MO_16)
6414 DO_LDN_2(4, hh, MO_16)
6415 
6416 DO_LDN_2(2, ss, MO_32)
6417 DO_LDN_2(3, ss, MO_32)
6418 DO_LDN_2(4, ss, MO_32)
6419 
6420 DO_LDN_2(2, dd, MO_64)
6421 DO_LDN_2(3, dd, MO_64)
6422 DO_LDN_2(4, dd, MO_64)
6423 
6424 DO_LDN_2(2, qq, MO_128)
6425 DO_LDN_2(3, qq, MO_128)
6426 DO_LDN_2(4, qq, MO_128)
6427 
6428 #undef DO_LDN_1
6429 #undef DO_LDN_2
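
/*
 * For reference, one expansion of the macros above (illustrative only,
 * whitespace rearranged): DO_LDN_2(2, hh, MO_16) emits, among others,
 *
 *     void HELPER(sve_ld2hh_le_r)(CPUARMState *env, void *vg,
 *                                 target_ulong addr, uint32_t desc)
 *     {
 *         sve_ldN_r(env, vg, addr, desc, GETPC(), MO_16, MO_16, 2, 0,
 *                   sve_ld1hh_le_host, sve_ld1hh_le_tlb);
 *     }
 *
 * so the 2/3/4-register forms reuse the single-register host and tlb
 * primitives and differ only in N.
 */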
6430 
6431 /*
6432  * Load contiguous data, first-fault and no-fault.
6433  *
6434  * For user-only, we control the race between page_check_range and
6435  * another thread's munmap by using set/clear_helper_retaddr.  Any
6436  * SEGV that occurs between those markers is assumed to be because
6437  * the guest page vanished.  Keep that block as small as possible
6438  * so that unrelated QEMU bugs are not blamed on the guest.
6439  */
6440 
6441 /* Fault on byte I.  All bits in FFR from I are cleared.  The vector
6442  * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
6443  * option, which leaves subsequent data unchanged.
6444  */
6445 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
6446 {
6447     uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
6448 
6449     if (i & 63) {
6450         ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
6451         i = ROUND_UP(i, 64);
6452     }
6453     for (; i < oprsz; i += 64) {
6454         ffr[i / 64] = 0;
6455     }
6456 }
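
/*
 * Worked example (illustrative only): reg_max = 32 and a fault recorded
 * at byte offset i = 10.  Since i & 63 is non-zero,
 *
 *     ffr[0] &= MAKE_64BIT_MASK(0, 10);
 *
 * keeps FFR bits 0..9 and clears bits 10..63; i is then rounded up to
 * 64, which is already >= reg_max, so no further words are zeroed.
 * Elements below offset 10 keep their FFR state, and everything from
 * the faulting element onward reads as inactive.
 */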
6457 
6458 /*
6459  * Common helper for all contiguous no-fault and first-fault loads.
6460  */
6461 static inline QEMU_ALWAYS_INLINE
6462 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6463                    uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6464                    const int esz, const int msz, const SVEContFault fault,
6465                    sve_ldst1_host_fn *host_fn,
6466                    sve_ldst1_tlb_fn *tlb_fn)
6467 {
6468     const unsigned rd = simd_data(desc);
6469     void *vd = &env->vfp.zregs[rd];
6470     const intptr_t reg_max = simd_oprsz(desc);
6471     intptr_t reg_off, mem_off, reg_last;
6472     SVEContLdSt info;
6473     int flags;
6474     void *host;
6475 
6476     /* Find the active elements.  */
6477     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6478         /* The entire predicate was false; no load occurs.  */
6479         memset(vd, 0, reg_max);
6480         return;
6481     }
6482     reg_off = info.reg_off_first[0];
6483 
6484     /* Probe the page(s). */
6485     if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6486         /* Fault on first element. */
6487         tcg_debug_assert(fault == FAULT_NO);
6488         memset(vd, 0, reg_max);
6489         goto do_fault;
6490     }
6491 
6492     mem_off = info.mem_off_first[0];
6493     flags = info.page[0].flags;
6494 
6495     /*
6496      * Disable MTE checking if the Tagged bit is not set.  Since TBI must
6497      * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6498      */
6499     if (!info.page[0].tagged) {
6500         mtedesc = 0;
6501     }
6502 
6503     if (fault == FAULT_FIRST) {
6504         /* Trapping mte check for the first-fault element.  */
6505         if (mtedesc) {
6506             mte_check(env, mtedesc, addr + mem_off, retaddr);
6507         }
6508 
6509         /*
6510          * Special handling of the first active element,
6511          * if it crosses a page boundary or is MMIO.
6512          */
6513         bool is_split = mem_off == info.mem_off_split;
6514         if (unlikely(flags != 0) || unlikely(is_split)) {
6515             /*
6516              * Use the slow path for cross-page handling.
6517              * Might trap for MMIO or watchpoints.
6518              */
6519             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6520 
6521             /* After any fault, zero the other elements. */
6522             swap_memzero(vd, reg_off);
6523             reg_off += 1 << esz;
6524             mem_off += 1 << msz;
6525             swap_memzero(vd + reg_off, reg_max - reg_off);
6526 
6527             if (is_split) {
6528                 goto second_page;
6529             }
6530         } else {
6531             memset(vd, 0, reg_max);
6532         }
6533     } else {
6534         memset(vd, 0, reg_max);
6535         if (unlikely(mem_off == info.mem_off_split)) {
6536             /* The first active element crosses a page boundary. */
6537             flags |= info.page[1].flags;
6538             if (unlikely(flags & TLB_MMIO)) {
6539                 /* Some page is MMIO, see below. */
6540                 goto do_fault;
6541             }
6542             if (unlikely(flags & TLB_WATCHPOINT) &&
6543                 (cpu_watchpoint_address_matches
6544                  (env_cpu(env), addr + mem_off, 1 << msz)
6545                  & BP_MEM_READ)) {
6546                 /* Watchpoint hit, see below. */
6547                 goto do_fault;
6548             }
6549             if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6550                 goto do_fault;
6551             }
6552             /*
6553              * Use the slow path for cross-page handling.
6554              * This is RAM, without a watchpoint, and will not trap.
6555              */
6556             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6557             goto second_page;
6558         }
6559     }
6560 
6561     /*
6562      * From this point on, all memory operations are MemSingleNF.
6563      *
6564      * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6565      * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6566      *
6567      * Unfortunately we do not have access to the memory attributes from the
6568      * PTE to tell Device memory from Normal memory.  So we make a mostly
6569      * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6570      * This gives the right answer for the common cases of "Normal memory,
6571      * backed by host RAM" and "Device memory, backed by MMIO".
6572      * The architecture allows us to suppress an NF load and return
6573      * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6574      * case of "Normal memory, backed by MMIO" is permitted.  The case we
6575      * get wrong is "Device memory, backed by host RAM", for which we
6576      * should return (UNKNOWN, FAULT) but do not.
6577      *
6578      * Similarly, CPU_BP breakpoints would raise exceptions, and so
6579      * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
6580      * architectural breakpoints the same.
6581      */
6582     if (unlikely(flags & TLB_MMIO)) {
6583         goto do_fault;
6584     }
6585 
6586     reg_last = info.reg_off_last[0];
6587     host = info.page[0].host;
6588 
6589     set_helper_retaddr(retaddr);
6590 
6591     do {
6592         uint64_t pg = *(uint64_t *)(vg + (reg_off >> 6) * 8);
6593         do {
6594             if ((pg >> (reg_off & 63)) & 1) {
6595                 if (unlikely(flags & TLB_WATCHPOINT) &&
6596                     (cpu_watchpoint_address_matches
6597                      (env_cpu(env), addr + mem_off, 1 << msz)
6598                      & BP_MEM_READ)) {
6599                     clear_helper_retaddr();
6600                     goto do_fault;
6601                 }
6602                 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6603                     clear_helper_retaddr();
6604                     goto do_fault;
6605                 }
6606                 host_fn(vd, reg_off, host + mem_off);
6607             }
6608             reg_off += 1 << esz;
6609             mem_off += 1 << msz;
6610         } while (reg_off <= reg_last && (reg_off & 63));
6611     } while (reg_off <= reg_last);
6612 
6613     clear_helper_retaddr();
6614 
6615     /*
6616      * MemSingleNF is allowed to fail for any reason.  We have special
6617      * code above to handle the first element crossing a page boundary.
6618      * As an implementation choice, decline to handle a cross-page element
6619      * in any other position.
6620      */
6621     reg_off = info.reg_off_split;
6622     if (reg_off >= 0) {
6623         goto do_fault;
6624     }
6625 
6626  second_page:
6627     reg_off = info.reg_off_first[1];
6628     if (likely(reg_off < 0)) {
6629         /* No active elements on the second page.  All done. */
6630         return;
6631     }
6632 
6633     /*
6634      * MemSingleNF is allowed to fail for any reason.  As an implementation
6635      * choice, decline to handle elements on the second page.  This should
6636      * be low frequency as the guest walks through memory -- the next
6637      * iteration of the guest's loop should be aligned on the page boundary,
6638      * and then all following iterations will stay aligned.
6639      */
6640 
6641  do_fault:
6642     record_fault(env, reg_off, reg_max);
6643 }
6644 
6645 static inline QEMU_ALWAYS_INLINE
6646 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6647                        uint32_t desc, const uintptr_t retaddr,
6648                        const int esz, const int msz, const SVEContFault fault,
6649                        sve_ldst1_host_fn *host_fn,
6650                        sve_ldst1_tlb_fn *tlb_fn)
6651 {
6652     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6653     int bit55 = extract64(addr, 55, 1);
6654 
6655     /* Remove mtedesc from the normal sve descriptor. */
6656     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6657 
6658     /* Perform gross MTE suppression early. */
6659     if (!tbi_check(mtedesc, bit55) ||
6660         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6661         mtedesc = 0;
6662     }
6663 
6664     sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6665                   esz, msz, fault, host_fn, tlb_fn);
6666 }
6667 
6668 #define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
6669 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
6670                                  target_ulong addr, uint32_t desc)      \
6671 {                                                                       \
6672     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6673                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6674 }                                                                       \
6675 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
6676                                  target_ulong addr, uint32_t desc)      \
6677 {                                                                       \
6678     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6679                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6680 }                                                                       \
6681 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6682                                      target_ulong addr, uint32_t desc)  \
6683 {                                                                       \
6684     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6685                       sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
6686 }                                                                       \
6687 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6688                                      target_ulong addr, uint32_t desc)  \
6689 {                                                                       \
6690     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6691                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6692 }
6693 
6694 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
6695 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
6696                                     target_ulong addr, uint32_t desc)   \
6697 {                                                                       \
6698     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6699                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6700 }                                                                       \
6701 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
6702                                     target_ulong addr, uint32_t desc)   \
6703 {                                                                       \
6704     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6705                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6706 }                                                                       \
6707 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
6708                                     target_ulong addr, uint32_t desc)   \
6709 {                                                                       \
6710     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6711                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6712 }                                                                       \
6713 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
6714                                     target_ulong addr, uint32_t desc)   \
6715 {                                                                       \
6716     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6717                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6718 }                                                                       \
6719 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6720                                         target_ulong addr, uint32_t desc) \
6721 {                                                                       \
6722     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6723                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6724 }                                                                       \
6725 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6726                                         target_ulong addr, uint32_t desc) \
6727 {                                                                       \
6728     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6729                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6730 }                                                                       \
6731 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6732                                         target_ulong addr, uint32_t desc) \
6733 {                                                                       \
6734     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6735                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6736 }                                                                       \
6737 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6738                                         target_ulong addr, uint32_t desc) \
6739 {                                                                       \
6740     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6741                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6742 }
6743 
6744 DO_LDFF1_LDNF1_1(bb,  MO_8)
6745 DO_LDFF1_LDNF1_1(bhu, MO_16)
6746 DO_LDFF1_LDNF1_1(bhs, MO_16)
6747 DO_LDFF1_LDNF1_1(bsu, MO_32)
6748 DO_LDFF1_LDNF1_1(bss, MO_32)
6749 DO_LDFF1_LDNF1_1(bdu, MO_64)
6750 DO_LDFF1_LDNF1_1(bds, MO_64)
6751 
6752 DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
6753 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6754 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6755 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6756 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6757 
6758 DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
6759 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6760 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6761 
6762 DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)
6763 
6764 #undef DO_LDFF1_LDNF1_1
6765 #undef DO_LDFF1_LDNF1_2
6766 
6767 /*
6768  * Common helper for all contiguous 1,2,3,4-register predicated stores.
6769  */
6770 
6771 static inline QEMU_ALWAYS_INLINE
6772 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6773                uint32_t desc, const uintptr_t retaddr,
6774                const int esz, const int msz, const int N, uint32_t mtedesc,
6775                sve_ldst1_host_fn *host_fn,
6776                sve_ldst1_tlb_fn *tlb_fn)
6777 {
6778     const unsigned rd = simd_data(desc);
6779     const intptr_t reg_max = simd_oprsz(desc);
6780     intptr_t reg_off, reg_last, mem_off;
6781     SVEContLdSt info;
6782     void *host;
6783     int i, flags;
6784 
6785     /* Find the active elements.  */
6786     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6787         /* The entire predicate was false; no store occurs.  */
6788         return;
6789     }
6790 
6791     /* Probe the page(s).  Exit with exception for any invalid page. */
6792     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6793 
6794     /* Handle watchpoints for all active elements. */
6795     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6796                               BP_MEM_WRITE, retaddr);
6797 
6798     /*
6799      * Handle mte checks for all active elements.
6800      * Since TBI must be set for MTE, !mtedesc => !mte_active.
6801      */
6802     if (mtedesc) {
6803         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6804                                 mtedesc, retaddr);
6805     }
6806 
6807     flags = info.page[0].flags | info.page[1].flags;
6808     if (unlikely(flags != 0)) {
6809         /*
6810          * At least one page includes MMIO.
6811          * Any bus operation can fail with cpu_transaction_failed,
6812          * which for ARM will raise SyncExternal.  We cannot avoid
6813          * this fault and will leave with the store incomplete.
6814          */
6815         mem_off = info.mem_off_first[0];
6816         reg_off = info.reg_off_first[0];
6817         reg_last = info.reg_off_last[1];
6818         if (reg_last < 0) {
6819             reg_last = info.reg_off_split;
6820             if (reg_last < 0) {
6821                 reg_last = info.reg_off_last[0];
6822             }
6823         }
6824 
6825         do {
6826             uint64_t pg = vg[reg_off >> 6];
6827             do {
6828                 if ((pg >> (reg_off & 63)) & 1) {
6829                     for (i = 0; i < N; ++i) {
6830                         tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6831                                addr + mem_off + (i << msz), retaddr);
6832                     }
6833                 }
6834                 reg_off += 1 << esz;
6835                 mem_off += N << msz;
6836             } while (reg_off & 63);
6837         } while (reg_off <= reg_last);
6838         return;
6839     }
6840 
6841     mem_off = info.mem_off_first[0];
6842     reg_off = info.reg_off_first[0];
6843     reg_last = info.reg_off_last[0];
6844     host = info.page[0].host;
6845 
6846     set_helper_retaddr(retaddr);
6847 
6848     while (reg_off <= reg_last) {
6849         uint64_t pg = vg[reg_off >> 6];
6850         do {
6851             if ((pg >> (reg_off & 63)) & 1) {
6852                 for (i = 0; i < N; ++i) {
6853                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6854                             host + mem_off + (i << msz));
6855                 }
6856             }
6857             reg_off += 1 << esz;
6858             mem_off += N << msz;
6859         } while (reg_off <= reg_last && (reg_off & 63));
6860     }
6861 
6862     clear_helper_retaddr();
6863 
6864     /*
6865      * Use the slow path to manage the cross-page misalignment.
6866      * But we know this is RAM and cannot trap.
6867      */
6868     mem_off = info.mem_off_split;
6869     if (unlikely(mem_off >= 0)) {
6870         reg_off = info.reg_off_split;
6871         for (i = 0; i < N; ++i) {
6872             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6873                    addr + mem_off + (i << msz), retaddr);
6874         }
6875     }
6876 
6877     mem_off = info.mem_off_first[1];
6878     if (unlikely(mem_off >= 0)) {
6879         reg_off = info.reg_off_first[1];
6880         reg_last = info.reg_off_last[1];
6881         host = info.page[1].host;
6882 
6883         set_helper_retaddr(retaddr);
6884 
6885         do {
6886             uint64_t pg = vg[reg_off >> 6];
6887             do {
6888                 if ((pg >> (reg_off & 63)) & 1) {
6889                     for (i = 0; i < N; ++i) {
6890                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6891                                 host + mem_off + (i << msz));
6892                     }
6893                 }
6894                 reg_off += 1 << esz;
6895                 mem_off += N << msz;
6896             } while (reg_off & 63);
6897         } while (reg_off <= reg_last);
6898 
6899         clear_helper_retaddr();
6900     }
6901 }
6902 
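/*
 * MTE wrapper for sve_stN_r: split the MTE descriptor out of DESC, and
 * clear it entirely when TBI is disabled for this address or the TCMA
 * check shows the access is unchecked, so that no tag checking is done.
 */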
6903 static inline QEMU_ALWAYS_INLINE
6904 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6905                    uint32_t desc, const uintptr_t ra,
6906                    const int esz, const int msz, const int N,
6907                    sve_ldst1_host_fn *host_fn,
6908                    sve_ldst1_tlb_fn *tlb_fn)
6909 {
6910     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6911     int bit55 = extract64(addr, 55, 1);
6912 
6913     /* Remove mtedesc from the normal sve descriptor. */
6914     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6915 
6916     /* Perform gross MTE suppression early. */
6917     if (!tbi_check(mtedesc, bit55) ||
6918         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6919         mtedesc = 0;
6920     }
6921 
6922     sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6923 }
6924 
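/*
 * Expand the contiguous ST1..ST4 store helpers.  The _1 forms access
 * byte-sized memory elements; the _2 forms provide little- and
 * big-endian variants for multi-byte memory elements.  Each helper has
 * an MTE-checked twin.
 */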
6925 #define DO_STN_1(N, NAME, ESZ)                                          \
6926 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
6927                                  target_ulong addr, uint32_t desc)      \
6928 {                                                                       \
6929     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
6930               sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
6931 }                                                                       \
6932 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
6933                                      target_ulong addr, uint32_t desc)  \
6934 {                                                                       \
6935     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
6936                   sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
6937 }
6938 
6939 #define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
6940 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
6941                                     target_ulong addr, uint32_t desc)   \
6942 {                                                                       \
6943     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6944               sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
6945 }                                                                       \
6946 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
6947                                     target_ulong addr, uint32_t desc)   \
6948 {                                                                       \
6949     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6950               sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
6951 }                                                                       \
6952 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
6953                                         target_ulong addr, uint32_t desc) \
6954 {                                                                       \
6955     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6956                   sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
6957 }                                                                       \
6958 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
6959                                         target_ulong addr, uint32_t desc) \
6960 {                                                                       \
6961     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6962                   sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
6963 }
6964 
6965 DO_STN_1(1, bb, MO_8)
6966 DO_STN_1(1, bh, MO_16)
6967 DO_STN_1(1, bs, MO_32)
6968 DO_STN_1(1, bd, MO_64)
6969 DO_STN_1(2, bb, MO_8)
6970 DO_STN_1(3, bb, MO_8)
6971 DO_STN_1(4, bb, MO_8)
6972 
6973 DO_STN_2(1, hh, MO_16, MO_16)
6974 DO_STN_2(1, hs, MO_32, MO_16)
6975 DO_STN_2(1, hd, MO_64, MO_16)
6976 DO_STN_2(2, hh, MO_16, MO_16)
6977 DO_STN_2(3, hh, MO_16, MO_16)
6978 DO_STN_2(4, hh, MO_16, MO_16)
6979 
6980 DO_STN_2(1, ss, MO_32, MO_32)
6981 DO_STN_2(1, sd, MO_64, MO_32)
6982 DO_STN_2(2, ss, MO_32, MO_32)
6983 DO_STN_2(3, ss, MO_32, MO_32)
6984 DO_STN_2(4, ss, MO_32, MO_32)
6985 
6986 DO_STN_2(1, dd, MO_64, MO_64)
6987 DO_STN_2(2, dd, MO_64, MO_64)
6988 DO_STN_2(3, dd, MO_64, MO_64)
6989 DO_STN_2(4, dd, MO_64, MO_64)
6990 
6991 DO_STN_2(1, sq, MO_128, MO_32)
6992 DO_STN_2(1, dq, MO_128, MO_64)
6993 
6994 DO_STN_2(2, qq, MO_128, MO_128)
6995 DO_STN_2(3, qq, MO_128, MO_128)
6996 DO_STN_2(4, qq, MO_128, MO_128)
6997 
6998 #undef DO_STN_1
6999 #undef DO_STN_2
7000 
7001 /*
7002  * Loads with a vector index.
7003  */
7004 
7005 /*
7006  * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
7007  */
7008 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
7009 
7010 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
7011 {
7012     return *(uint32_t *)(reg + H1_4(reg_ofs));
7013 }
7014 
7015 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
7016 {
7017     return *(int32_t *)(reg + H1_4(reg_ofs));
7018 }
7019 
7020 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
7021 {
7022     return (uint32_t)*(uint64_t *)(reg + reg_ofs);
7023 }
7024 
7025 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
7026 {
7027     return (int32_t)*(uint64_t *)(reg + reg_ofs);
7028 }
7029 
7030 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
7031 {
7032     return *(uint64_t *)(reg + reg_ofs);
7033 }
7034 
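/*
 * Common helper for all gather loads: walk the active elements, probe
 * each target page, and collect the loaded values in a scratch register,
 * writing back to the destination only after all faults, watchpoints
 * and MTE checks have been taken.
 */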
7035 static inline QEMU_ALWAYS_INLINE
7036 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7037                target_ulong base, uint32_t desc, uintptr_t retaddr,
7038                uint32_t mtedesc, int esize, int msize,
7039                zreg_off_fn *off_fn,
7040                sve_ldst1_host_fn *host_fn,
7041                sve_ldst1_tlb_fn *tlb_fn)
7042 {
7043     const int mmu_idx = arm_env_mmu_index(env);
7044     const intptr_t reg_max = simd_oprsz(desc);
7045     const int scale = simd_data(desc);
7046     ARMVectorReg scratch;
7047     intptr_t reg_off;
7048     SVEHostPage info, info2;
7049 
7050     memset(&scratch, 0, reg_max);
7051     reg_off = 0;
7052     do {
7053         uint64_t pg = vg[reg_off >> 6];
7054         do {
7055             if (likely(pg & 1)) {
7056                 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7057                 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7058 
7059                 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
7060                                mmu_idx, retaddr);
7061 
7062                 if (likely(in_page >= msize)) {
7063                     if (unlikely(info.flags & TLB_WATCHPOINT)) {
7064                         cpu_check_watchpoint(env_cpu(env), addr, msize,
7065                                              info.attrs, BP_MEM_READ, retaddr);
7066                     }
7067                     if (mtedesc && info.tagged) {
7068                         mte_check(env, mtedesc, addr, retaddr);
7069                     }
7070                     if (unlikely(info.flags & TLB_MMIO)) {
7071                         tlb_fn(env, &scratch, reg_off, addr, retaddr);
7072                     } else {
7073                         set_helper_retaddr(retaddr);
7074                         host_fn(&scratch, reg_off, info.host);
7075                         clear_helper_retaddr();
7076                     }
7077                 } else {
7078                     /* Element crosses the page boundary. */
7079                     sve_probe_page(&info2, false, env, addr + in_page, 0,
7080                                    MMU_DATA_LOAD, mmu_idx, retaddr);
7081                     if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
7082                         cpu_check_watchpoint(env_cpu(env), addr,
7083                                              msize, info.attrs,
7084                                              BP_MEM_READ, retaddr);
7085                     }
7086                     if (mtedesc && info.tagged) {
7087                         mte_check(env, mtedesc, addr, retaddr);
7088                     }
7089                     tlb_fn(env, &scratch, reg_off, addr, retaddr);
7090                 }
7091             }
7092             reg_off += esize;
7093             pg >>= esize;
7094         } while (reg_off & 63);
7095     } while (reg_off < reg_max);
7096 
7097     /* Wait until all exceptions have been raised to write back.  */
7098     memcpy(vd, &scratch, reg_max);
7099 }
7100 
7101 static inline QEMU_ALWAYS_INLINE
7102 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7103                    target_ulong base, uint32_t desc, uintptr_t retaddr,
7104                    int esize, int msize, zreg_off_fn *off_fn,
7105                    sve_ldst1_host_fn *host_fn,
7106                    sve_ldst1_tlb_fn *tlb_fn)
7107 {
7108     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7109     /* Remove mtedesc from the normal sve descriptor. */
7110     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7111 
7112     /*
7113      * ??? TODO: For the 32-bit offset extractions, adding the offset
7114      * cannot move base across the address space hole, and so cannot
7115      * change the pointer tag or the bit55 selector.  We could therefore
7116      * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
7117      */
7118     sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7119               esize, msize, off_fn, host_fn, tlb_fn);
7120 }
7121 
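/*
 * Expand the gather load helpers for each memory access type and offset
 * flavour: zsu = zero-extended 32-bit offsets, zss = sign-extended
 * 32-bit offsets, zd = 64-bit offsets.  The _S forms use 32-bit vector
 * elements, the _D forms 64-bit.
 */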
7122 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
7123 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
7124                                  void *vm, target_ulong base, uint32_t desc) \
7125 {                                                                            \
7126     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
7127               off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
7128 }                                                                            \
7129 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7130      void *vm, target_ulong base, uint32_t desc)                             \
7131 {                                                                            \
7132     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
7133                   off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
7134 }
7135 
7136 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
7137 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
7138                                  void *vm, target_ulong base, uint32_t desc) \
7139 {                                                                            \
7140     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
7141               off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
7142 }                                                                            \
7143 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7144     void *vm, target_ulong base, uint32_t desc)                              \
7145 {                                                                            \
7146     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
7147                   off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
7148 }
7149 
7150 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
7151 DO_LD1_ZPZ_S(bsu, zss, MO_8)
7152 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
7153 DO_LD1_ZPZ_D(bdu, zss, MO_8)
7154 DO_LD1_ZPZ_D(bdu, zd, MO_8)
7155 
7156 DO_LD1_ZPZ_S(bss, zsu, MO_8)
7157 DO_LD1_ZPZ_S(bss, zss, MO_8)
7158 DO_LD1_ZPZ_D(bds, zsu, MO_8)
7159 DO_LD1_ZPZ_D(bds, zss, MO_8)
7160 DO_LD1_ZPZ_D(bds, zd, MO_8)
7161 
7162 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
7163 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
7164 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
7165 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
7166 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
7167 
7168 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
7169 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
7170 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
7171 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
7172 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
7173 
7174 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
7175 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
7176 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
7177 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
7178 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
7179 
7180 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
7181 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
7182 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
7183 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
7184 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
7185 
7186 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
7187 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
7188 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
7189 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
7190 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
7191 
7192 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
7193 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
7194 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
7195 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
7196 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
7197 
7198 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
7199 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
7200 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
7201 
7202 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
7203 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
7204 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
7205 
7206 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
7207 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
7208 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
7209 
7210 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
7211 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
7212 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
7213 
7214 DO_LD1_ZPZ_D(qq_le, zd, MO_128)
7215 DO_LD1_ZPZ_D(qq_be, zd, MO_128)
7216 
7217 #undef DO_LD1_ZPZ_S
7218 #undef DO_LD1_ZPZ_D
7219 
7220 /* First fault loads with a vector index.  */
7221 
7222 /*
7223  * Common helpers for all gather first-faulting loads.
7224  */
7225 
7226 static inline QEMU_ALWAYS_INLINE
7227 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7228                  target_ulong base, uint32_t desc, uintptr_t retaddr,
7229                  uint32_t mtedesc, const int esz, const int msz,
7230                  zreg_off_fn *off_fn,
7231                  sve_ldst1_host_fn *host_fn,
7232                  sve_ldst1_tlb_fn *tlb_fn)
7233 {
7234     const int mmu_idx = arm_env_mmu_index(env);
7235     const intptr_t reg_max = simd_oprsz(desc);
7236     const int scale = simd_data(desc);
7237     const int esize = 1 << esz;
7238     const int msize = 1 << msz;
7239     intptr_t reg_off;
7240     SVEHostPage info;
7241     target_ulong addr, in_page;
7242     ARMVectorReg scratch;
7243 
7244     /* Skip to the first true predicate.  */
7245     reg_off = find_next_active(vg, 0, reg_max, esz);
7246     if (unlikely(reg_off >= reg_max)) {
7247         /* The entire predicate was false; no load occurs.  */
7248         memset(vd, 0, reg_max);
7249         return;
7250     }
7251 
7252     /* Protect against overlap between vd and vm. */
7253     if (unlikely(vd == vm)) {
7254         vm = memcpy(&scratch, vm, reg_max);
7255     }
7256 
7257     /*
7258      * Probe the first element, allowing faults.
7259      */
7260     addr = base + (off_fn(vm, reg_off) << scale);
7261     if (mtedesc) {
7262         mte_check(env, mtedesc, addr, retaddr);
7263     }
7264     tlb_fn(env, vd, reg_off, addr, retaddr);
7265 
7266     /* After any fault, zero the other elements. */
7267     swap_memzero(vd, reg_off);
7268     reg_off += esize;
7269     swap_memzero(vd + reg_off, reg_max - reg_off);
7270 
7271     /*
7272      * Probe the remaining elements, not allowing faults.
7273      */
7274     while (reg_off < reg_max) {
7275         uint64_t pg = vg[reg_off >> 6];
7276         do {
7277             if (likely((pg >> (reg_off & 63)) & 1)) {
7278                 addr = base + (off_fn(vm, reg_off) << scale);
7279                 in_page = -(addr | TARGET_PAGE_MASK);
7280 
7281                 if (unlikely(in_page < msize)) {
7282                     /* Stop if the element crosses a page boundary. */
7283                     goto fault;
7284                 }
7285 
7286                 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
7287                                mmu_idx, retaddr);
7288                 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
7289                     goto fault;
7290                 }
7291                 if (unlikely(info.flags & TLB_WATCHPOINT) &&
7292                     (cpu_watchpoint_address_matches
7293                      (env_cpu(env), addr, msize) & BP_MEM_READ)) {
7294                     goto fault;
7295                 }
7296                 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
7297                     goto fault;
7298                 }
7299 
7300                 set_helper_retaddr(retaddr);
7301                 host_fn(vd, reg_off, info.host);
7302                 clear_helper_retaddr();
7303             }
7304             reg_off += esize;
7305         } while (reg_off & 63);
7306     }
7307     return;
7308 
7309  fault:
7310     record_fault(env, reg_off, reg_max);
7311 }
7312 
7313 static inline QEMU_ALWAYS_INLINE
7314 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7315                      target_ulong base, uint32_t desc, uintptr_t retaddr,
7316                      const int esz, const int msz,
7317                      zreg_off_fn *off_fn,
7318                      sve_ldst1_host_fn *host_fn,
7319                      sve_ldst1_tlb_fn *tlb_fn)
7320 {
7321     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7322     /* Remove mtedesc from the normal sve descriptor. */
7323     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7324 
7325     /*
7326      * ??? TODO: For the 32-bit offset extractions, adding the offset
7327      * cannot move base across the address space hole, and so cannot
7328      * change the pointer tag or the bit55 selector.  We could therefore
7329      * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
7330      */
7331     sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7332                 esz, msz, off_fn, host_fn, tlb_fn);
7333 }
7334 
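/*
 * Expand the first-fault gather helpers.  Only the first active element
 * may take a fault; any later element that cannot be loaded safely
 * (invalid page, MMIO, watchpoint, or failed MTE probe) stops the load
 * and is reported via record_fault().
 */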
7335 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
7336 void HELPER(sve_ldff##MEM##_##OFS)                                      \
7337     (CPUARMState *env, void *vd, void *vg,                              \
7338      void *vm, target_ulong base, uint32_t desc)                        \
7339 {                                                                       \
7340     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
7341                 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7342 }                                                                       \
7343 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
7344     (CPUARMState *env, void *vd, void *vg,                              \
7345      void *vm, target_ulong base, uint32_t desc)                        \
7346 {                                                                       \
7347     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
7348                     off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7349 }
7350 
7351 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
7352 void HELPER(sve_ldff##MEM##_##OFS)                                      \
7353     (CPUARMState *env, void *vd, void *vg,                              \
7354      void *vm, target_ulong base, uint32_t desc)                        \
7355 {                                                                       \
7356     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
7357                 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7358 }                                                                       \
7359 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
7360     (CPUARMState *env, void *vd, void *vg,                              \
7361      void *vm, target_ulong base, uint32_t desc)                        \
7362 {                                                                       \
7363     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
7364                     off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7365 }
7366 
7367 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
7368 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
7369 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
7370 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
7371 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
7372 
7373 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
7374 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
7375 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
7376 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
7377 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
7378 
7379 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
7380 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
7381 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
7382 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
7383 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
7384 
7385 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
7386 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
7387 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
7388 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
7389 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
7390 
7391 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
7392 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
7393 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
7394 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
7395 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
7396 
7397 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
7398 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
7399 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
7400 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
7401 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
7402 
7403 DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
7404 DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
7405 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
7406 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
7407 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
7408 
7409 DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
7410 DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
7411 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
7412 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
7413 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
7414 
7415 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
7416 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
7417 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
7418 
7419 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
7420 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
7421 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
7422 
7423 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
7424 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
7425 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
7426 
7427 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
7428 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
7429 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
7430 
7431 /* Stores with a vector index.  */
7432 
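/*
 * Common helper for all scatter stores.  The first pass probes every
 * active element, raising any recognizable exception before memory is
 * modified; the second pass performs the stores, using the cached host
 * addresses for the common case and the slow path for MMIO or
 * page-crossing elements.
 */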
7433 static inline QEMU_ALWAYS_INLINE
7434 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7435                target_ulong base, uint32_t desc, uintptr_t retaddr,
7436                uint32_t mtedesc, int esize, int msize,
7437                zreg_off_fn *off_fn,
7438                sve_ldst1_host_fn *host_fn,
7439                sve_ldst1_tlb_fn *tlb_fn)
7440 {
7441     const int mmu_idx = arm_env_mmu_index(env);
7442     const intptr_t reg_max = simd_oprsz(desc);
7443     const int scale = simd_data(desc);
7444     void *host[ARM_MAX_VQ * 4];
7445     intptr_t reg_off, i;
7446     SVEHostPage info, info2;
7447 
7448     /*
7449      * Probe all of the elements for host addresses and flags.
7450      */
7451     i = reg_off = 0;
7452     do {
7453         uint64_t pg = vg[reg_off >> 6];
7454         do {
7455             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7456             target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7457 
7458             host[i] = NULL;
7459             if (likely((pg >> (reg_off & 63)) & 1)) {
7460                 if (likely(in_page >= msize)) {
7461                     sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
7462                                    mmu_idx, retaddr);
7463                     if (!(info.flags & TLB_MMIO)) {
7464                         host[i] = info.host;
7465                     }
7466                 } else {
7467                     /*
7468                      * Element crosses the page boundary.
7469                      * Probe both pages, but do not record the host address,
7470                      * so that we use the slow path.
7471                      */
7472                     sve_probe_page(&info, false, env, addr, 0,
7473                                    MMU_DATA_STORE, mmu_idx, retaddr);
7474                     sve_probe_page(&info2, false, env, addr + in_page, 0,
7475                                    MMU_DATA_STORE, mmu_idx, retaddr);
7476                     info.flags |= info2.flags;
7477                 }
7478 
7479                 if (unlikely(info.flags & TLB_WATCHPOINT)) {
7480                     cpu_check_watchpoint(env_cpu(env), addr, msize,
7481                                          info.attrs, BP_MEM_WRITE, retaddr);
7482                 }
7483 
7484                 if (mtedesc && info.tagged) {
7485                     mte_check(env, mtedesc, addr, retaddr);
7486                 }
7487             }
7488             i += 1;
7489             reg_off += esize;
7490         } while (reg_off & 63);
7491     } while (reg_off < reg_max);
7492 
7493     /*
7494      * Now that we have recognized all exceptions except SyncExternal
7495      * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7496      *
7497      * Note for the common case of an element in RAM, not crossing a page
7498      * boundary, we have stored the host address in host[].  This doubles
7499      * as a first-level check against the predicate, since only enabled
7500      * elements have non-null host addresses.
7501      */
7502     i = reg_off = 0;
7503     do {
7504         void *h = host[i];
7505         if (likely(h != NULL)) {
7506             set_helper_retaddr(retaddr);
7507             host_fn(vd, reg_off, h);
7508             clear_helper_retaddr();
7509         } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7510             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7511             tlb_fn(env, vd, reg_off, addr, retaddr);
7512         }
7513         i += 1;
7514         reg_off += esize;
7515     } while (reg_off < reg_max);
7516 }
7517 
7518 static inline QEMU_ALWAYS_INLINE
7519 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7520                    target_ulong base, uint32_t desc, uintptr_t retaddr,
7521                    int esize, int msize, zreg_off_fn *off_fn,
7522                    sve_ldst1_host_fn *host_fn,
7523                    sve_ldst1_tlb_fn *tlb_fn)
7524 {
7525     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7526     /* Remove mtedesc from the normal sve descriptor. */
7527     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7528 
7529     /*
7530      * ??? TODO: For the 32-bit offset extractions, adding the offset
7531      * cannot move base across the address space hole, and so cannot
7532      * change the pointer tag or the bit55 selector.  We could therefore
7533      * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
7534      */
7535     sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7536               esize, msize, off_fn, host_fn, tlb_fn);
7537 }
7538 
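/*
 * Expand the scatter store helpers; as for the gather loads, the _S
 * forms use 32-bit vector elements and the _D forms 64-bit, with the
 * same zsu/zss/zd offset flavours.
 */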
7539 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
7540 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7541                                  void *vm, target_ulong base, uint32_t desc) \
7542 {                                                                       \
7543     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
7544               off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7545 }                                                                       \
7546 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7547     void *vm, target_ulong base, uint32_t desc)                         \
7548 {                                                                       \
7549     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
7550                   off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7551 }
7552 
7553 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
7554 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7555                                  void *vm, target_ulong base, uint32_t desc) \
7556 {                                                                       \
7557     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
7558               off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7559 }                                                                       \
7560 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7561     void *vm, target_ulong base, uint32_t desc)                         \
7562 {                                                                       \
7563     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
7564                   off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7565 }
7566 
7567 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7568 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7569 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7570 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7571 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7572 
7573 DO_ST1_ZPZ_S(bs, zss, MO_8)
7574 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7575 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7576 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7577 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7578 
7579 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7580 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7581 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7582 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7583 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7584 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7585 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7586 
7587 DO_ST1_ZPZ_D(bd, zss, MO_8)
7588 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7589 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7590 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7591 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7592 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7593 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7594 
7595 DO_ST1_ZPZ_D(bd, zd, MO_8)
7596 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7597 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7598 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7599 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7600 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7601 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7602 
7603 DO_ST1_ZPZ_D(qq_le, zd, MO_128)
7604 DO_ST1_ZPZ_D(qq_be, zd, MO_128)
7605 
7606 #undef DO_ST1_ZPZ_S
7607 #undef DO_ST1_ZPZ_D
7608 
7609 /*
7610  * SVE2.1 consecutive register load/store
7611  */
7612 
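/*
 * Analyse a contiguous operation governed by a predicate-as-counter:
 * decode PNG into an element count and stride, and fill *info with the
 * active element offsets on either side of any page boundary.  Returns
 * the byte stride between consecutive active elements, or 0 if no
 * element is active.
 */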
7613 static unsigned sve2p1_cont_ldst_elements(SVEContLdSt *info, vaddr addr,
7614                                           uint32_t png, intptr_t reg_max,
7615                                           int N, int v_esz)
7616 {
7617     const int esize = 1 << v_esz;
7618     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
7619     DecodeCounter p = decode_counter(png, reg_max, v_esz);
7620     unsigned b_count = p.count << v_esz;
7621     unsigned b_stride = 1 << (v_esz + p.lg2_stride);
7622     intptr_t page_split;
7623 
7624     /* Set all of the element indices to -1, and the TLB data to 0. */
7625     memset(info, -1, offsetof(SVEContLdSt, page));
7626     memset(info->page, 0, sizeof(info->page));
7627 
7628     if (p.invert) {
7629         if (b_count >= reg_max * N) {
7630             return 0;
7631         }
7632         reg_off_first = b_count;
7633         reg_off_last = reg_max * N - b_stride;
7634     } else {
7635         if (b_count == 0) {
7636             return 0;
7637         }
7638         reg_off_first = 0;
7639         reg_off_last = MIN(b_count - esize, reg_max * N - b_stride);
7640     }
7641 
7642     info->reg_off_first[0] = reg_off_first;
7643     info->mem_off_first[0] = reg_off_first;
7644 
7645     page_split = -(addr | TARGET_PAGE_MASK);
7646     if (reg_off_last + esize <= page_split || reg_off_first >= page_split) {
7647         /* The entire operation fits within a single page. */
7648         info->reg_off_last[0] = reg_off_last;
7649         return b_stride;
7650     }
7651 
7652     info->page_split = page_split;
7653     reg_off_split = ROUND_DOWN(page_split, esize);
7654 
7655     /*
7656      * This is the last full element on the first page, but it is not
7657      * necessarily active.  If there is no full element, i.e. the first
7658      * active element is the one that's split, this value remains -1.
7659      * It is useful as iteration bounds.
7660      */
7661     if (reg_off_split != 0) {
7662         info->reg_off_last[0] = ROUND_DOWN(reg_off_split - esize, b_stride);
7663     }
7664 
7665     /* Determine if an unaligned element spans the pages.  */
7666     if (page_split & (esize - 1)) {
7667         /* It is helpful to know if the split element is active. */
7668         if ((reg_off_split & (b_stride - 1)) == 0) {
7669             info->reg_off_split = reg_off_split;
7670             info->mem_off_split = reg_off_split;
7671         }
7672         reg_off_split += esize;
7673     }
7674 
7675     /*
7676      * We do want the first active element on the second page, because
7677      * this may affect the address reported in an exception.
7678      */
7679     reg_off_split = ROUND_UP(reg_off_split, b_stride);
7680     if (reg_off_split <= reg_off_last) {
7681         info->reg_off_first[1] = reg_off_split;
7682         info->mem_off_first[1] = reg_off_split;
7683         info->reg_off_last[1] = reg_off_last;
7684     }
7685     return b_stride;
7686 }
7687 
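/*
 * Check watchpoints for all active elements of a predicate-as-counter
 * operation, then clear TLB_WATCHPOINT from the cached page flags so
 * the main loops need not consider watchpoints again.
 */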
7688 static void sve2p1_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
7689                                          target_ulong addr, unsigned estride,
7690                                          int esize, int wp_access, uintptr_t ra)
7691 {
7692 #ifndef CONFIG_USER_ONLY
7693     intptr_t count_off, count_last;
7694     int flags0 = info->page[0].flags;
7695     int flags1 = info->page[1].flags;
7696 
7697     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
7698         return;
7699     }
7700 
7701     /* Indicate that watchpoints are handled. */
7702     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
7703     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
7704 
7705     if (flags0 & TLB_WATCHPOINT) {
7706         count_off = info->reg_off_first[0];
7707         count_last = info->reg_off_split;
7708         if (count_last < 0) {
7709             count_last = info->reg_off_last[0];
7710         }
7711         do {
7712             cpu_check_watchpoint(env_cpu(env), addr + count_off,
7713                                  esize, info->page[0].attrs, wp_access, ra);
7714             count_off += estride;
7715         } while (count_off <= count_last);
7716     }
7717 
7718     count_off = info->reg_off_first[1];
7719     if ((flags1 & TLB_WATCHPOINT) && count_off >= 0) {
7720         count_last = info->reg_off_last[1];
7721         do {
7722             cpu_check_watchpoint(env_cpu(env), addr + count_off,
7723                                  esize, info->page[1].attrs,
7724                                  wp_access, ra);
7725             count_off += estride;
7726         } while (count_off <= count_last);
7727     }
7728 #endif
7729 }
7730 
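/*
 * Perform MTE checks for all active elements of a predicate-as-counter
 * operation, one page at a time, skipping pages that are not Tagged.
 */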
7731 static void sve2p1_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
7732                                        target_ulong addr, unsigned estride,
7733                                        int esize, uint32_t mtedesc,
7734                                        uintptr_t ra)
7735 {
7736     intptr_t count_off, count_last;
7737 
7738     /*
7739      * TODO: estride is always a small power of two, <= 8.
7740      * Manipulate the stride within the loops such that
7741      *   - first iteration hits addr + off, as required,
7742      *   - second iteration hits ALIGN_UP(addr, 16),
7743      *   - other iterations advance addr by 16.
7744      * This will minimize the probing to once per MTE granule.
7745      */
7746 
7747     /* Process the page only if MemAttr == Tagged. */
7748     if (info->page[0].tagged) {
7749         count_off = info->reg_off_first[0];
7750         count_last = info->reg_off_split;
7751         if (count_last < 0) {
7752             count_last = info->reg_off_last[0];
7753         }
7754 
7755         do {
7756             mte_check(env, mtedesc, addr + count_off, ra);
7757             count_off += estride;
7758         } while (count_off <= count_last);
7759     }
7760 
7761     count_off = info->reg_off_first[1];
7762     if (count_off >= 0 && info->page[1].tagged) {
7763         count_last = info->reg_off_last[1];
7764         do {
7765             mte_check(env, mtedesc, addr + count_off, ra);
7766             count_off += estride;
7767         } while (count_off <= count_last);
7768     }
7769 }
7770 
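/*
 * Common helper for the contiguous loads with a counter predicate:
 * N (2 or 4) vectors are loaded into registers spaced RSTRIDE apart
 * from a single contiguous block of memory, with the active portion
 * described by PNG.
 */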
7771 static inline QEMU_ALWAYS_INLINE
7772 void sve2p1_ld1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr,
7773                   uint32_t png, uint32_t desc,
7774                   const uintptr_t ra, const MemOp esz,
7775                   sve_ldst1_host_fn *host_fn,
7776                   sve_ldst1_tlb_fn *tlb_fn)
7777 {
7778     const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2;
7779     const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4);
7780     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7781     const intptr_t reg_max = simd_oprsz(desc);
7782     const unsigned esize = 1 << esz;
7783     intptr_t count_off, count_last;
7784     intptr_t reg_off, reg_last, reg_n;
7785     SVEContLdSt info;
7786     unsigned estride, flags;
7787     void *host;
7788 
7789     estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz);
7790     if (estride == 0) {
7791         /* The entire predicate was false; no load occurs.  */
7792         for (unsigned n = 0; n < N; n++) {
7793             memset(zd + n * rstride, 0, reg_max);
7794         }
7795         return;
7796     }
7797 
7798     /* Probe the page(s).  Exit with exception for any invalid page. */
7799     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);
7800 
7801     /* Handle watchpoints for all active elements. */
7802     sve2p1_cont_ldst_watchpoints(&info, env, addr, estride,
7803                                  esize, BP_MEM_READ, ra);
7804 
7805     /*
7806      * Handle mte checks for all active elements.
7807      * Since TBI must be set for MTE, !mtedesc => !mte_active.
7808      */
7809     if (mtedesc) {
7810         sve2p1_cont_ldst_mte_check(&info, env, addr, estride,
7811                                    esize, mtedesc, ra);
7812     }
7813 
7814     flags = info.page[0].flags | info.page[1].flags;
7815     if (unlikely(flags != 0)) {
7816         /*
7817          * At least one page includes MMIO.
7818          * Any bus operation can fail with cpu_transaction_failed,
7819          * which for ARM will raise SyncExternal.  Perform the load
7820          * into scratch memory to preserve register state until the end.
7821          */
7822         ARMVectorReg scratch[4] = { };
7823 
7824         count_off = info.reg_off_first[0];
7825         count_last = info.reg_off_last[1];
7826         if (count_last < 0) {
7827             count_last = info.reg_off_split;
7828             if (count_last < 0) {
7829                 count_last = info.reg_off_last[0];
7830             }
7831         }
7832         reg_off = count_off % reg_max;
7833         reg_n = count_off / reg_max;
7834 
7835         do {
7836             reg_last = MIN(count_last - count_off, reg_max - esize);
7837             do {
7838                 tlb_fn(env, &scratch[reg_n], reg_off, addr + count_off, ra);
7839                 reg_off += estride;
7840                 count_off += estride;
7841             } while (reg_off <= reg_last);
7842             reg_off = 0;
7843             reg_n++;
7844         } while (count_off <= count_last);
7845 
7846         for (unsigned n = 0; n < N; ++n) {
7847             memcpy(&zd[n * rstride], &scratch[n], reg_max);
7848         }
7849         return;
7850     }
7851 
7852     /* The entire operation is in RAM, on valid pages. */
7853 
7854     for (unsigned n = 0; n < N; ++n) {
7855         memset(&zd[n * rstride], 0, reg_max);
7856     }
7857 
7858     count_off = info.reg_off_first[0];
7859     count_last = info.reg_off_last[0];
7860     reg_off = count_off % reg_max;
7861     reg_n = count_off / reg_max;
7862     host = info.page[0].host;
7863 
7864     set_helper_retaddr(ra);
7865 
7866     do {
7867         reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
7868         do {
7869             host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
7870             reg_off += estride;
7871             count_off += estride;
7872         } while (reg_off <= reg_last);
7873         reg_off = 0;
7874         reg_n++;
7875     } while (count_off <= count_last);
7876 
7877     clear_helper_retaddr();
7878 
7879     /*
7880      * Use the slow path to manage the cross-page misalignment.
7881      * But we know this is RAM and cannot trap.
7882      */
7883     count_off = info.reg_off_split;
7884     if (unlikely(count_off >= 0)) {
7885         reg_off = count_off % reg_max;
7886         reg_n = count_off / reg_max;
7887         tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra);
7888     }
7889 
7890     count_off = info.reg_off_first[1];
7891     if (unlikely(count_off >= 0)) {
7892         count_last = info.reg_off_last[1];
7893         reg_off = count_off % reg_max;
7894         reg_n = count_off / reg_max;
7895         host = info.page[1].host;
7896 
7897         set_helper_retaddr(ra);
7898 
7899         do {
7900             reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
7901             do {
7902                 host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
7903                 reg_off += estride;
7904                 count_off += estride;
7905             } while (reg_off <= reg_last);
7906             reg_off = 0;
7907             reg_n++;
7908         } while (count_off <= count_last);
7909 
7910         clear_helper_retaddr();
7911     }
7912 }
7913 
7914 void HELPER(sve2p1_ld1bb_c)(CPUARMState *env, void *vd, target_ulong addr,
7915                             uint32_t png, uint32_t desc)
7916 {
7917     sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), MO_8,
7918                  sve_ld1bb_host, sve_ld1bb_tlb);
7919 }
7920 
7921 #define DO_LD1_2(NAME, ESZ)                                             \
7922 void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd,           \
7923                                   target_ulong addr, uint32_t png,      \
7924                                   uint32_t desc)                        \
7925 {                                                                       \
7926     sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ,                \
7927                  sve_##NAME##_le_host, sve_##NAME##_le_tlb);            \
7928 }                                                                       \
7929 void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd,           \
7930                                   target_ulong addr, uint32_t png,      \
7931                                   uint32_t desc)                        \
7932 {                                                                       \
7933     sve2p1_ld1_c(env, vd, addr, png, desc, GETPC(), ESZ,                \
7934                  sve_##NAME##_be_host, sve_##NAME##_be_tlb);            \
7935 }
7936 
7937 DO_LD1_2(ld1hh, MO_16)
7938 DO_LD1_2(ld1ss, MO_32)
7939 DO_LD1_2(ld1dd, MO_64)
7940 
7941 #undef DO_LD1_2
7942 
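/*
 * Common helper for the contiguous stores with a counter predicate,
 * mirroring sve2p1_ld1_c: probe, watchpoint and MTE checks come first,
 * then the data is stored directly from the source registers.
 */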
7943 static inline QEMU_ALWAYS_INLINE
7944 void sve2p1_st1_c(CPUARMState *env, ARMVectorReg *zd, const vaddr addr,
7945                   uint32_t png, uint32_t desc,
7946                   const uintptr_t ra, const int esz,
7947                   sve_ldst1_host_fn *host_fn,
7948                   sve_ldst1_tlb_fn *tlb_fn)
7949 {
7950     const unsigned N = (desc >> SIMD_DATA_SHIFT) & 1 ? 4 : 2;
7951     const unsigned rstride = 1 << ((desc >> (SIMD_DATA_SHIFT + 1)) % 4);
7952     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7953     const intptr_t reg_max = simd_oprsz(desc);
7954     const unsigned esize = 1 << esz;
7955     intptr_t count_off, count_last;
7956     intptr_t reg_off, reg_last, reg_n;
7957     SVEContLdSt info;
7958     unsigned estride, flags;
7959     void *host;
7960 
7961     estride = sve2p1_cont_ldst_elements(&info, addr, png, reg_max, N, esz);
7962     if (estride == 0) {
7963         /* The entire predicate was false; no store occurs.  */
7964         return;
7965     }
7966 
7967     /* Probe the page(s).  Exit with exception for any invalid page. */
7968     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);
7969 
7970     /* Handle watchpoints for all active elements. */
7971     sve2p1_cont_ldst_watchpoints(&info, env, addr, estride,
7972                                  esize, BP_MEM_WRITE, ra);
7973 
7974     /*
7975      * Handle mte checks for all active elements.
7976      * Since TBI must be set for MTE, !mtedesc => !mte_active.
7977      */
7978     if (mtedesc) {
7979         sve2p1_cont_ldst_mte_check(&info, env, addr, estride,
7980                                    esize, mtedesc, ra);
7981     }
7982 
7983     flags = info.page[0].flags | info.page[1].flags;
7984     if (unlikely(flags != 0)) {
7985         /*
7986          * At least one page includes MMIO.
7987          * Any bus operation can fail with cpu_transaction_failed,
7988          * which for ARM will raise SyncExternal.  We cannot avoid
7989          * this fault and will leave with the store incomplete.
7990          */
7991         count_off = info.reg_off_first[0];
7992         count_last = info.reg_off_last[1];
7993         if (count_last < 0) {
7994             count_last = info.reg_off_split;
7995             if (count_last < 0) {
7996                 count_last = info.reg_off_last[0];
7997             }
7998         }
7999         reg_off = count_off % reg_max;
8000         reg_n = count_off / reg_max;
8001 
8002         do {
8003             reg_last = MIN(count_last - count_off, reg_max - esize);
8004             do {
8005                 tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra);
8006                 reg_off += estride;
8007                 count_off += estride;
8008             } while (reg_off <= reg_last);
8009             reg_off = 0;
8010             reg_n++;
8011         } while (count_off <= count_last);
8012         return;
8013     }
8014 
8015     /* The entire operation is in RAM, on valid pages. */
8016 
8017     count_off = info.reg_off_first[0];
8018     count_last = info.reg_off_last[0];
8019     reg_off = count_off % reg_max;
8020     reg_n = count_off / reg_max;
8021     host = info.page[0].host;
8022 
8023     set_helper_retaddr(ra);
8024 
8025     do {
8026         reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
8027         do {
8028             host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
8029             reg_off += estride;
8030             count_off += estride;
8031         } while (reg_off <= reg_last);
8032         reg_off = 0;
8033         reg_n++;
8034     } while (count_off <= count_last);
8035 
8036     clear_helper_retaddr();
8037 
8038     /*
8039      * Use the slow path to manage the cross-page misalignment.
8040      * But we know this is RAM and cannot trap.
8041      */
8042     count_off = info.reg_off_split;
8043     if (unlikely(count_off >= 0)) {
8044         reg_off = count_off % reg_max;
8045         reg_n = count_off / reg_max;
8046         tlb_fn(env, &zd[reg_n * rstride], reg_off, addr + count_off, ra);
8047     }
8048 
8049     count_off = info.reg_off_first[1];
8050     if (unlikely(count_off >= 0)) {
8051         count_last = info.reg_off_last[1];
8052         reg_off = count_off % reg_max;
8053         reg_n = count_off / reg_max;
8054         host = info.page[1].host;
8055 
8056         set_helper_retaddr(ra);
8057 
8058         do {
8059             reg_last = MIN(count_last - reg_n * reg_max, reg_max - esize);
8060             do {
8061                 host_fn(&zd[reg_n * rstride], reg_off, host + count_off);
8062                 reg_off += estride;
8063                 count_off += estride;
8064             } while (reg_off <= reg_last);
8065             reg_off = 0;
8066             reg_n++;
8067         } while (count_off <= count_last);
8068 
8069         clear_helper_retaddr();
8070     }
8071 }
8072 
8073 void HELPER(sve2p1_st1bb_c)(CPUARMState *env, void *vd, target_ulong addr,
8074                             uint32_t png, uint32_t desc)
8075 {
8076     sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), MO_8,
8077                  sve_st1bb_host, sve_st1bb_tlb);
8078 }
8079 
8080 #define DO_ST1_2(NAME, ESZ)                                             \
8081 void HELPER(sve2p1_##NAME##_le_c)(CPUARMState *env, void *vd,           \
8082                                   target_ulong addr, uint32_t png,      \
8083                                   uint32_t desc)                        \
8084 {                                                                       \
8085     sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ,                \
8086                  sve_##NAME##_le_host, sve_##NAME##_le_tlb);            \
8087 }                                                                       \
8088 void HELPER(sve2p1_##NAME##_be_c)(CPUARMState *env, void *vd,           \
8089                                   target_ulong addr, uint32_t png,      \
8090                                   uint32_t desc)                        \
8091 {                                                                       \
8092     sve2p1_st1_c(env, vd, addr, png, desc, GETPC(), ESZ,                \
8093                  sve_##NAME##_be_host, sve_##NAME##_be_tlb);            \
8094 }
8095 
8096 DO_ST1_2(st1hh, MO_16)
8097 DO_ST1_2(st1ss, MO_32)
8098 DO_ST1_2(st1dd, MO_64)
8099 
8100 #undef DO_ST1_2
8101 
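/*
 * SVE2 bitwise ternary operations.  Each helper operates on the vector
 * in 64-bit chunks; the unpredicated semantics are:
 *   EOR3:  d = n ^ m ^ k
 *   BCAX:  d = n ^ (m & ~k)
 *   BSL1N: d = (~n & k) | (m & ~k)
 *   BSL2N: d = (n & k) | (~m & ~k)
 *   NBSL:  d = ~((n & k) | (m & ~k))
 */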
8102 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
8103 {
8104     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8105     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
8106 
8107     for (i = 0; i < opr_sz; ++i) {
8108         d[i] = n[i] ^ m[i] ^ k[i];
8109     }
8110 }
8111 
8112 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
8113 {
8114     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8115     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
8116 
8117     for (i = 0; i < opr_sz; ++i) {
8118         d[i] = n[i] ^ (m[i] & ~k[i]);
8119     }
8120 }
8121 
8122 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
8123 {
8124     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8125     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
8126 
8127     for (i = 0; i < opr_sz; ++i) {
8128         d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
8129     }
8130 }
8131 
8132 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
8133 {
8134     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8135     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
8136 
8137     for (i = 0; i < opr_sz; ++i) {
8138         d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
8139     }
8140 }
8141 
8142 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
8143 {
8144     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8145     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
8146 
8147     for (i = 0; i < opr_sz; ++i) {
8148         d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
8149     }
8150 }
8151 
8152 /*
8153  * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
8154  * See hasless(v,1) from
8155  *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
8156  */
8157 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
8158 {
8159     int bits = 8 << esz;
8160     uint64_t ones = dup_const(esz, 1);
8161     uint64_t signs = ones << (bits - 1);
8162     uint64_t cmp0, cmp1;
8163 
8164     cmp1 = dup_const(esz, n);
8165     cmp0 = cmp1 ^ m0;
8166     cmp1 = cmp1 ^ m1;
8167     cmp0 = (cmp0 - ones) & ~cmp0;
8168     cmp1 = (cmp1 - ones) & ~cmp1;
8169     return (cmp0 | cmp1) & signs;
8170 }
8171 
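/*
 * Implementation of SVE2 MATCH and NMATCH.  For each active element of
 * Zn within a 16-byte segment, test whether that value occurs anywhere
 * in the corresponding 16-byte segment of Zm, and set the destination
 * predicate bit accordingly (inverted for NMATCH).  The NZCV result is
 * computed as for a PTEST of the destination against Pg.
 */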
8172 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
8173                                 uint32_t desc, int esz, bool nmatch)
8174 {
8175     uint16_t esz_mask = pred_esz_masks[esz];
8176     intptr_t opr_sz = simd_oprsz(desc);
8177     uint32_t flags = PREDTEST_INIT;
8178     intptr_t i, j, k;
8179 
8180     for (i = 0; i < opr_sz; i += 16) {
8181         uint64_t m0 = *(uint64_t *)(vm + i);
8182         uint64_t m1 = *(uint64_t *)(vm + i + 8);
8183         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
8184         uint16_t out = 0;
8185 
8186         for (j = 0; j < 16; j += 8) {
8187             uint64_t n = *(uint64_t *)(vn + i + j);
8188 
8189             for (k = 0; k < 8; k += 1 << esz) {
8190                 if (pg & (1 << (j + k))) {
8191                     bool o = do_match2(n >> (k * 8), m0, m1, esz);
8192                     out |= (o ^ nmatch) << (j + k);
8193                 }
8194             }
8195         }
8196         *(uint16_t *)(vd + H1_2(i >> 3)) = out;
8197         flags = iter_predtest_fwd(out, pg, flags);
8198     }
8199     return flags;
8200 }
8201 
8202 #define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
8203 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
8204 {                                                                             \
8205     return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
8206 }
8207 
8208 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
8209 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
8210 
8211 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
8212 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
8213 
8214 #undef DO_PPZZ_MATCH
8215 
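/*
 * Implementation of SVE2 HISTCNT.  For each active element i, count the
 * active elements j <= i for which Zm[j] equals Zn[i]; inactive elements
 * of the destination are zeroed.
 */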
8216 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
8217                             uint32_t desc)
8218 {
8219     ARMVectorReg scratch;
8220     intptr_t i, j;
8221     intptr_t opr_sz = simd_oprsz(desc);
8222     uint32_t *d = vd, *n = vn, *m = vm;
8223     uint8_t *pg = vg;
8224 
8225     if (d == n) {
8226         n = memcpy(&scratch, n, opr_sz);
8227         if (d == m) {
8228             m = n;
8229         }
8230     } else if (d == m) {
8231         m = memcpy(&scratch, m, opr_sz);
8232     }
8233 
8234     for (i = 0; i < opr_sz; i += 4) {
8235         uint64_t count = 0;
8236         uint8_t pred;
8237 
8238         pred = pg[H1(i >> 3)] >> (i & 7);
8239         if (pred & 1) {
8240             uint32_t nn = n[H4(i >> 2)];
8241 
8242             for (j = 0; j <= i; j += 4) {
8243                 pred = pg[H1(j >> 3)] >> (j & 7);
8244                 if ((pred & 1) && nn == m[H4(j >> 2)]) {
8245                     ++count;
8246                 }
8247             }
8248         }
8249         d[H4(i >> 2)] = count;
8250     }
8251 }
8252 
8253 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
8254                             uint32_t desc)
8255 {
8256     ARMVectorReg scratch;
8257     intptr_t i, j;
8258     intptr_t opr_sz = simd_oprsz(desc);
8259     uint64_t *d = vd, *n = vn, *m = vm;
8260     uint8_t *pg = vg;
8261 
8262     if (d == n) {
8263         n = memcpy(&scratch, n, opr_sz);
8264         if (d == m) {
8265             m = n;
8266         }
8267     } else if (d == m) {
8268         m = memcpy(&scratch, m, opr_sz);
8269     }
8270 
8271     for (i = 0; i < opr_sz / 8; ++i) {
8272         uint64_t count = 0;
8273         if (pg[H1(i)] & 1) {
8274             uint64_t nn = n[i];
8275             for (j = 0; j <= i; ++j) {
8276                 if ((pg[H1(j)] & 1) && nn == m[j]) {
8277                     ++count;
8278                 }
8279             }
8280         }
8281         d[i] = count;
8282     }
8283 }
8284 
8285 /*
8286  * Returns the number of bytes in m0 and m1 that match n.
8287  * Unlike do_match2 we don't just need true/false, we need an exact count.
8288  * This requires two extra logical operations.
8289  */
8290 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
8291 {
8292     const uint64_t mask = dup_const(MO_8, 0x7f);
8293     uint64_t cmp0, cmp1;
8294 
8295     cmp1 = dup_const(MO_8, n);
8296     cmp0 = cmp1 ^ m0;
8297     cmp1 = cmp1 ^ m1;
8298 
8299     /*
8300      * 1: clear msb of each byte to avoid carry to next byte (& mask)
8301      * 2: carry in to msb if byte != 0 (+ mask)
8302      * 3: set msb if cmp has msb set (| cmp)
8303      * 4: set ~msb to ignore them (| mask)
8304      * We now have 0xff for byte != 0 or 0x7f for byte == 0.
8305      * 5: invert, resulting in 0x80 if and only if byte == 0.
8306      */
8307     cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
8308     cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
8309 
8310     /*
8311      * Combine the two compares in a way that the bits do
8312      * not overlap, and so preserves the count of set bits.
8313      * If the host has an efficient instruction for ctpop,
8314      * then ctpop(x) + ctpop(y) has the same number of
8315      * operations as ctpop(x | (y >> 1)).  If the host does
8316      * not have an efficient ctpop, then we only want to
8317      * use it once.
8318      */
8319     return ctpop64(cmp0 | (cmp1 >> 1));
8320 }
8321 
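/*
 * Implementation of SVE2 HISTSEG.  Within each 16-byte segment, replace
 * every byte of Zn with the count of bytes in the corresponding segment
 * of Zm that are equal to it.  The operation is unpredicated.
 */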
8322 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
8323 {
8324     intptr_t i, j;
8325     intptr_t opr_sz = simd_oprsz(desc);
8326 
8327     for (i = 0; i < opr_sz; i += 16) {
8328         uint64_t n0 = *(uint64_t *)(vn + i);
8329         uint64_t m0 = *(uint64_t *)(vm + i);
8330         uint64_t n1 = *(uint64_t *)(vn + i + 8);
8331         uint64_t m1 = *(uint64_t *)(vm + i + 8);
8332         uint64_t out0 = 0;
8333         uint64_t out1 = 0;
8334 
8335         for (j = 0; j < 64; j += 8) {
8336             uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
8337             uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
8338             out0 |= cnt0 << j;
8339             out1 |= cnt1 << j;
8340         }
8341 
8342         *(uint64_t *)(vd + i) = out0;
8343         *(uint64_t *)(vd + i + 8) = out1;
8344     }
8345 }
8346 
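/*
 * Implementation of SVE2 XAR: exclusive-or the two inputs, then rotate
 * each element right by the immediate.  The byte and halfword variants
 * implement the per-element rotate with shifts and masks, since the
 * vector is processed in 64-bit chunks.
 */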
8347 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
8348 {
8349     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8350     int shr = simd_data(desc);
8351     int shl = 8 - shr;
8352     uint64_t mask = dup_const(MO_8, 0xff >> shr);
8353     uint64_t *d = vd, *n = vn, *m = vm;
8354 
8355     for (i = 0; i < opr_sz; ++i) {
8356         uint64_t t = n[i] ^ m[i];
8357         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
8358     }
8359 }
8360 
8361 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
8362 {
8363     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
8364     int shr = simd_data(desc);
8365     int shl = 16 - shr;
8366     uint64_t mask = dup_const(MO_16, 0xffff >> shr);
8367     uint64_t *d = vd, *n = vn, *m = vm;
8368 
8369     for (i = 0; i < opr_sz; ++i) {
8370         uint64_t t = n[i] ^ m[i];
8371         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
8372     }
8373 }
8374 
8375 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
8376 {
8377     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
8378     int shr = simd_data(desc);
8379     uint32_t *d = vd, *n = vn, *m = vm;
8380 
8381     for (i = 0; i < opr_sz; ++i) {
8382         d[i] = ror32(n[i] ^ m[i], shr);
8383     }
8384 }
8385 
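/*
 * FMMLA: floating-point matrix multiply-accumulate.  Each 128-bit
 * (single) or 256-bit (double) segment holds a 2x2 matrix in row-major
 * order; compute D = A + N * M^T, with every product and sum performed
 * as a separate IEEE operation (no fusing).
 */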
8386 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
8387                      float_status *status, uint32_t desc)
8388 {
8389     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
8390 
8391     for (s = 0; s < opr_sz; ++s) {
8392         float32 *n = vn + s * sizeof(float32) * 4;
8393         float32 *m = vm + s * sizeof(float32) * 4;
8394         float32 *a = va + s * sizeof(float32) * 4;
8395         float32 *d = vd + s * sizeof(float32) * 4;
8396         float32 n00 = n[H4(0)], n01 = n[H4(1)];
8397         float32 n10 = n[H4(2)], n11 = n[H4(3)];
8398         float32 m00 = m[H4(0)], m01 = m[H4(1)];
8399         float32 m10 = m[H4(2)], m11 = m[H4(3)];
8400         float32 p0, p1;
8401 
8402         /* i = 0, j = 0 */
8403         p0 = float32_mul(n00, m00, status);
8404         p1 = float32_mul(n01, m01, status);
8405         d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
8406 
8407         /* i = 0, j = 1 */
8408         p0 = float32_mul(n00, m10, status);
8409         p1 = float32_mul(n01, m11, status);
8410         d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
8411 
8412         /* i = 1, j = 0 */
8413         p0 = float32_mul(n10, m00, status);
8414         p1 = float32_mul(n11, m01, status);
8415         d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
8416 
8417         /* i = 1, j = 1 */
8418         p0 = float32_mul(n10, m10, status);
8419         p1 = float32_mul(n11, m11, status);
8420         d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
8421     }
8422 }
8423 
8424 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
8425                      float_status *status, uint32_t desc)
8426 {
8427     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
8428 
8429     for (s = 0; s < opr_sz; ++s) {
8430         float64 *n = vn + s * sizeof(float64) * 4;
8431         float64 *m = vm + s * sizeof(float64) * 4;
8432         float64 *a = va + s * sizeof(float64) * 4;
8433         float64 *d = vd + s * sizeof(float64) * 4;
8434         float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
8435         float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
8436         float64 p0, p1;
8437 
8438         /* i = 0, j = 0 */
8439         p0 = float64_mul(n00, m00, status);
8440         p1 = float64_mul(n01, m01, status);
8441         d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
8442 
8443         /* i = 0, j = 1 */
8444         p0 = float64_mul(n00, m10, status);
8445         p1 = float64_mul(n01, m11, status);
8446         d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
8447 
8448         /* i = 1, j = 0 */
8449         p0 = float64_mul(n10, m00, status);
8450         p1 = float64_mul(n11, m01, status);
8451         d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
8452 
8453         /* i = 1, j = 1 */
8454         p0 = float64_mul(n10, m10, status);
8455         p1 = float64_mul(n11, m11, status);
8456         d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
8457     }
8458 }
8459 
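/*
 * FCVTNT/BFCVTNT: for each active wide element, convert and write the
 * narrowed result into the high (odd-numbered) narrow element of the
 * same slot, leaving the low narrow elements of the destination
 * unchanged.  The loop walks backwards so that each 64-bit predicate
 * word is loaded only once.
 */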
8460 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
8461 void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
8462                   float_status *status, uint32_t desc)                        \
8463 {                                                                             \
8464     intptr_t i = simd_oprsz(desc);                                            \
8465     uint64_t *g = vg;                                                         \
8466     do {                                                                      \
8467         uint64_t pg = g[(i - 1) >> 6];                                        \
8468         do {                                                                  \
8469             i -= sizeof(TYPEW);                                               \
8470             if (likely((pg >> (i & 63)) & 1)) {                               \
8471                 TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
8472                 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
8473             }                                                                 \
8474         } while (i & 63);                                                     \
8475     } while (i != 0);                                                         \
8476 }
8477 
8478 DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
8479 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
8480 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
8481 
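/*
 * FCVTLT: the converse widening conversion, reading each active high
 * (odd-numbered) narrow element and writing the widened result to the
 * wide element occupying the same slot.
 */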
8482 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
8483 void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
8484                   float_status *status, uint32_t desc)                        \
8485 {                                                                             \
8486     intptr_t i = simd_oprsz(desc);                                            \
8487     uint64_t *g = vg;                                                         \
8488     do {                                                                      \
8489         uint64_t pg = g[(i - 1) >> 6];                                        \
8490         do {                                                                  \
8491             i -= sizeof(TYPEW);                                               \
8492             if (likely((pg >> (i & 63)) & 1)) {                               \
8493                 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
8494                 *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
8495             }                                                                 \
8496         } while (i & 63);                                                     \
8497     } while (i != 0);                                                         \
8498 }
8499 
8500 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
8501 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
8502 
8503 #undef DO_FCVTLT
8504 #undef DO_FCVTNT
8505 
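/*
 * Implementation of SVE2p1 PEXT: expand one part of a predicate-as-counter
 * into a regular predicate register.  The counter is decoded via
 * decode_counter into an element count and an invert flag; the selected
 * part is then materialized with the do_whilel/do_whileg helpers.
 */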
8506 void HELPER(pext)(void *vd, uint32_t png, uint32_t desc)
8507 {
8508     int pl = FIELD_EX32(desc, PREDDESC, OPRSZ);
8509     int vl = pl * 8;
8510     unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ);
8511     int part = FIELD_EX32(desc, PREDDESC, DATA);
8512     DecodeCounter p = decode_counter(png, vl, v_esz);
8513     uint64_t mask = pred_esz_masks[v_esz + p.lg2_stride];
8514     ARMPredicateReg *d = vd;
8515 
8516     /*
8517      * Convert from element count to byte count and adjust
8518      * for the portion of the 4*VL counter to be extracted.
8519      */
8520     int b_count = (p.count << v_esz) - vl * part;
8521 
8522     memset(d, 0, sizeof(*d));
8523     if (p.invert) {
8524         if (b_count <= 0) {
8525             do_whilel(vd, mask, vl, vl);
8526         } else if (b_count < vl) {
8527             do_whileg(vd, mask, vl - b_count, vl);
8528         }
8529     } else if (b_count > 0) {
8530         do_whilel(vd, mask, MIN(b_count, vl), vl);
8531     }
8532 }
8533