1 /*
2  * ARM SVE Operations
3  *
4  * Copyright (c) 2018 Linaro, Ltd.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/page-protection.h"
25 #include "exec/helper-proto.h"
26 #include "exec/tlb-flags.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "fpu/softfloat.h"
29 #include "tcg/tcg.h"
30 #include "vec_internal.h"
31 #include "sve_ldst_internal.h"
32 #include "accel/tcg/cpu-ops.h"
33 #ifdef CONFIG_USER_ONLY
34 #include "user/page-protection.h"
35 #endif
36 
37 
38 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
39  *
40  * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
41  * and bit 0 set if C is set.  Compare the definitions of these variables
42  * within CPUARMState.
43  */
44 
45 /* For no G bits set, NZCV = C.  */
46 #define PREDTEST_INIT  1
47 
48 /* This is an iterative function, called for each Pd and Pg word
49  * moving forward.
50  */
51 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
52 {
53     if (likely(g)) {
54         /* Compute N from first D & G.
55            Use bit 2 to signal first G bit seen.  */
56         if (!(flags & 4)) {
57             flags |= ((d & (g & -g)) != 0) << 31;
58             flags |= 4;
59         }
60 
61         /* Accumulate Z from each D & G.  */
62         flags |= ((d & g) != 0) << 1;
63 
64         /* Compute C from last !(D & G).  Replace previous.  */
65         flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
66     }
67     return flags;
68 }
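/*
 * Worked example (illustration only): for a single predicate word with
 * g = 0x11 (elements 0 and 4 active) and d = 0x01,
 *   - the first active bit of d (g & -g = bit 0) is set    -> N, bit 31
 *   - some active bit of d is set (d & g != 0)             -> !Z, bit 1
 *   - the last active bit (pow2floor(g) = bit 4) is clear  -> C, bit 0
 * so iter_predtest_fwd(d, g, PREDTEST_INIT) returns 0x80000007
 * (bit 2 is only the internal "first G bit seen" marker),
 * i.e. NZCV = 0b1010.
 */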
69 
70 /* This is an iterative function, called for each Pd and Pg word
71  * moving backward.
72  */
73 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
74 {
75     if (likely(g)) {
        /* Compute C from first (i.e. last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
78         if (!(flags & 4)) {
79             flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
80             flags |= (d & pow2floor(g)) == 0;
81         }
82 
83         /* Accumulate Z from each D & G.  */
84         flags |= ((d & g) != 0) << 1;
85 
        /* Compute N from last (i.e. first) D & G.  Replace previous.  */
87         flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
88     }
89     return flags;
90 }
91 
92 /* The same for a single word predicate.  */
93 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
94 {
95     return iter_predtest_fwd(d, g, PREDTEST_INIT);
96 }
97 
98 /* The same for a multi-word predicate.  */
99 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
100 {
101     uint32_t flags = PREDTEST_INIT;
102     uint64_t *d = vd, *g = vg;
103     uintptr_t i = 0;
104 
105     do {
106         flags = iter_predtest_fwd(d[i], g[i], flags);
107     } while (++i < words);
108 
109     return flags;
110 }
111 
112 /* Similarly for single word elements.  */
113 static inline uint64_t expand_pred_s(uint8_t byte)
114 {
115     static const uint64_t word[] = {
116         [0x01] = 0x00000000ffffffffull,
117         [0x10] = 0xffffffff00000000ull,
118         [0x11] = 0xffffffffffffffffull,
119     };
120     return word[byte & 0x11];
121 }
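/*
 * For example: 32-bit elements use predicate bits 0 and 4 of each byte,
 * so expand_pred_s(0x10) yields 0xffffffff00000000 (only the second
 * word element active), and expand_pred_s(0xff) indexes word[0x11],
 * i.e. all ones.
 */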
122 
123 #define LOGICAL_PPPP(NAME, FUNC) \
124 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
125 {                                                                         \
126     uintptr_t opr_sz = simd_oprsz(desc);                                  \
127     uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
128     uintptr_t i;                                                          \
129     for (i = 0; i < opr_sz / 8; ++i) {                                    \
130         d[i] = FUNC(n[i], m[i], g[i]);                                    \
131     }                                                                     \
132 }
133 
134 #define DO_AND(N, M, G)  (((N) & (M)) & (G))
135 #define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
136 #define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
137 #define DO_ORR(N, M, G)  (((N) | (M)) & (G))
138 #define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
139 #define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
140 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
141 #define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
142 
143 LOGICAL_PPPP(sve_and_pppp, DO_AND)
144 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
145 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
146 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
147 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
148 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
149 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
150 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
151 
152 #undef DO_AND
153 #undef DO_BIC
154 #undef DO_EOR
155 #undef DO_ORR
156 #undef DO_ORN
157 #undef DO_NOR
158 #undef DO_NAND
159 #undef DO_SEL
160 #undef LOGICAL_PPPP
161 
162 /* Fully general three-operand expander, controlled by a predicate.
163  * This is complicated by the host-endian storage of the register file.
164  */
165 /* ??? I don't expect the compiler could ever vectorize this itself.
166  * With some tables we can convert bit masks to byte masks, and with
167  * extra care wrt byte/word ordering we could use gcc generic vectors
168  * and do 16 bytes at a time.
169  */
170 #define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
171 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
172 {                                                                       \
173     intptr_t i, opr_sz = simd_oprsz(desc);                              \
174     for (i = 0; i < opr_sz; ) {                                         \
175         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
176         do {                                                            \
177             if (pg & 1) {                                               \
178                 TYPE nn = *(TYPE *)(vn + H(i));                         \
179                 TYPE mm = *(TYPE *)(vm + H(i));                         \
180                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
181             }                                                           \
182             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
183         } while (i & 15);                                               \
184     }                                                                   \
185 }
186 
187 /* Similarly, specialized for 64-bit operands.  */
188 #define DO_ZPZZ_D(NAME, TYPE, OP)                                \
189 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
190 {                                                               \
191     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
192     TYPE *d = vd, *n = vn, *m = vm;                             \
193     uint8_t *pg = vg;                                           \
194     for (i = 0; i < opr_sz; i += 1) {                           \
195         if (pg[H1(i)] & 1) {                                    \
196             TYPE nn = n[i], mm = m[i];                          \
197             d[i] = OP(nn, mm);                                  \
198         }                                                       \
199     }                                                           \
200 }
201 
202 #define DO_AND(N, M)  (N & M)
203 #define DO_EOR(N, M)  (N ^ M)
204 #define DO_ORR(N, M)  (N | M)
205 #define DO_BIC(N, M)  (N & ~M)
206 #define DO_ADD(N, M)  (N + M)
207 #define DO_SUB(N, M)  (N - M)
208 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
209 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
210 #define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
211 #define DO_MUL(N, M)  (N * M)
212 
213 
214 /*
215  * We must avoid the C undefined behaviour cases: division by
216  * zero and signed division of INT_MIN by -1. Both of these
217  * have architecturally defined required results for Arm.
218  * We special case all signed divisions by -1 to avoid having
219  * to deduce the minimum integer for the type involved.
220  */
221 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
222 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
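/*
 * For example, DO_SDIV(INT32_MIN, -1) takes the M == -1 arm and yields
 * -N, i.e. INT32_MIN after the two's-complement wrap, which is the
 * architecturally required result; DO_SDIV(x, 0) and DO_UDIV(x, 0)
 * both yield 0, as the architecture also requires.
 */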
223 
224 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
225 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
226 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
227 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
228 
229 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
230 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
231 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
232 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
233 
234 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
235 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
236 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
237 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
238 
239 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
240 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
241 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
242 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
243 
244 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
245 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
246 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
247 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
248 
249 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
250 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
251 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
252 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
253 
254 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
255 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
256 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
257 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
258 
259 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
260 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
261 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
262 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
263 
264 DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
265 DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
266 DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
267 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)
268 
269 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
270 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
271 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
272 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
273 
274 DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
275 DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
276 DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
277 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)
278 
279 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
280 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
281 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
282 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
283 
284 /* Because the computation type is at least twice as large as required,
285    these work for both signed and unsigned source types.  */
286 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
287 {
288     return (n * m) >> 8;
289 }
290 
291 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
292 {
293     return (n * m) >> 16;
294 }
295 
296 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
297 {
298     return (n * m) >> 32;
299 }
300 
301 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
302 {
303     uint64_t lo, hi;
304     muls64(&lo, &hi, n, m);
305     return hi;
306 }
307 
308 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
309 {
310     uint64_t lo, hi;
311     mulu64(&lo, &hi, n, m);
312     return hi;
313 }
314 
315 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
316 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
317 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
318 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
319 
320 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
321 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
322 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
323 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
324 
325 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
326 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
327 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
328 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
329 
330 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
331 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
332 
333 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
334 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
335 
336 /* Note that all bits of the shift are significant
337    and not modulo the element size.  */
338 #define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
339 #define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
340 #define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
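/*
 * For example, with 8-bit elements and a shift amount of 200,
 * DO_ASR(-4, 200) == -1 (the amount is clamped to 7), while
 * DO_LSR(0x80, 200) and DO_LSL(0x80, 200) are both 0.
 */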
341 
DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
353 
354 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
355 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
356 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
357 
358 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
359 {
360     int8_t n1 = n, n2 = n >> 8;
361     return m + n1 + n2;
362 }
363 
364 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
365 {
366     int16_t n1 = n, n2 = n >> 16;
367     return m + n1 + n2;
368 }
369 
370 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
371 {
372     int32_t n1 = n, n2 = n >> 32;
373     return m + n1 + n2;
374 }
375 
376 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
377 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
378 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
379 
380 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
381 {
382     uint8_t n1 = n, n2 = n >> 8;
383     return m + n1 + n2;
384 }
385 
386 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
387 {
388     uint16_t n1 = n, n2 = n >> 16;
389     return m + n1 + n2;
390 }
391 
392 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
393 {
394     uint32_t n1 = n, n2 = n >> 32;
395     return m + n1 + n2;
396 }
397 
398 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
399 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
400 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
401 
402 #define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
403 #define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
404 #define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
405 #define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)
406 
407 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
408 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
409 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
410 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
411 
412 #define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
413 #define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
414 #define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
415 #define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)
416 
417 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
418 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
419 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
420 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
421 
422 /*
423  * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
424  * We pass in a pointer to a dummy saturation field to trigger
425  * the saturating arithmetic but discard the information about
426  * whether it has occurred.
427  */
428 #define do_sqshl_b(n, m) \
429    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
430 #define do_sqshl_h(n, m) \
431    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
432 #define do_sqshl_s(n, m) \
433    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
434 #define do_sqshl_d(n, m) \
435    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
436 
DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1, do_sqshl_b)
438 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
439 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
440 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
441 
442 #define do_uqshl_b(n, m) \
443    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
444 #define do_uqshl_h(n, m) \
445    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
446 #define do_uqshl_s(n, m) \
447    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
448 #define do_uqshl_d(n, m) \
449    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
450 
DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1, do_uqshl_b)
452 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
453 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
454 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
455 
456 #define do_sqrshl_b(n, m) \
457    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
458 #define do_sqrshl_h(n, m) \
459    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
460 #define do_sqrshl_s(n, m) \
461    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
462 #define do_sqrshl_d(n, m) \
463    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
464 
DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1, do_sqrshl_b)
466 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
467 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
468 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
469 
470 #undef do_sqrshl_d
471 
472 #define do_uqrshl_b(n, m) \
473    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
474 #define do_uqrshl_h(n, m) \
475    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
476 #define do_uqrshl_s(n, m) \
477    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
478 #define do_uqrshl_d(n, m) \
479    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
480 
DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1, do_uqrshl_b)
482 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
483 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
484 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
485 
486 #undef do_uqrshl_d
487 
488 #define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
489 #define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))
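/*
 * DO_HADD_D computes floor((n + m) / 2) without forming the 65-bit sum:
 * for example, unsigned n = m = UINT64_MAX gives
 * (n >> 1) + (m >> 1) + (n & m & 1) = UINT64_MAX with no overflow.
 */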
490 
491 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
492 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
493 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
494 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
495 
496 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
497 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
498 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
499 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
500 
501 #define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
502 #define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))
503 
504 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
505 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
506 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
507 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
508 
509 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
510 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
511 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
512 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
513 
514 #define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
515 #define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))
516 
517 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
518 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
519 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
520 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
521 
522 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
523 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
524 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
525 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
526 
527 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
528 {
529     return val >= max ? max : val <= min ? min : val;
530 }
531 
532 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
533 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
534 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
535 
536 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
537 {
538     int64_t r = n + m;
539     if (((r ^ n) & ~(n ^ m)) < 0) {
540         /* Signed overflow.  */
541         return r < 0 ? INT64_MAX : INT64_MIN;
542     }
543     return r;
544 }
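/*
 * The (r ^ n) & ~(n ^ m) test is the usual two's-complement trick:
 * overflow occurred iff the operands have the same sign and the result
 * has the opposite sign.  For example, n = m = INT64_MAX gives r = -2,
 * so the expression is negative and the sum saturates to INT64_MAX.
 */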
545 
546 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
547 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
548 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
549 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
550 
551 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
552 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
553 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
554 
555 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
556 {
557     uint64_t r = n + m;
558     return r < n ? UINT64_MAX : r;
559 }
560 
561 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
562 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
563 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
564 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
565 
566 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
567 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
568 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
569 
570 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
571 {
572     int64_t r = n - m;
573     if (((r ^ n) & (n ^ m)) < 0) {
574         /* Signed overflow.  */
575         return r < 0 ? INT64_MAX : INT64_MIN;
576     }
577     return r;
578 }
579 
580 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
581 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
582 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
583 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
584 
585 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
586 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
587 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
588 
589 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
590 {
591     return n > m ? n - m : 0;
592 }
593 
594 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
595 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
596 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
597 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
598 
599 #define DO_SUQADD_B(n, m) \
600     do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
601 #define DO_SUQADD_H(n, m) \
602     do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
603 #define DO_SUQADD_S(n, m) \
604     do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
605 
606 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
607 {
608     uint64_t r = n + m;
609 
610     if (n < 0) {
611         /* Note that m - abs(n) cannot underflow. */
612         if (r > INT64_MAX) {
613             /* Result is either very large positive or negative. */
614             if (m > -n) {
615                 /* m > abs(n), so r is a very large positive. */
616                 return INT64_MAX;
617             }
618             /* Result is negative. */
619         }
620     } else {
621         /* Both inputs are positive: check for overflow.  */
622         if (r < m || r > INT64_MAX) {
623             return INT64_MAX;
624         }
625     }
626     return r;
627 }
628 
629 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
630 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
631 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
632 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
633 
634 #define DO_USQADD_B(n, m) \
635     do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
636 #define DO_USQADD_H(n, m) \
637     do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
638 #define DO_USQADD_S(n, m) \
639     do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
640 
641 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
642 {
643     uint64_t r = n + m;
644 
645     if (m < 0) {
646         return n < -m ? 0 : r;
647     }
648     return r < n ? UINT64_MAX : r;
649 }
650 
651 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
652 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
653 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
654 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
655 
656 #undef DO_ZPZZ
657 #undef DO_ZPZZ_D
658 
659 /*
660  * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements come from VN {I, I+1}.
 * If the slot I is odd, the elements come from VM {I-1, I}.
663  * Load all of the input elements in each pair before overwriting output.
664  */
665 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
666 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
667 {                                                               \
668     intptr_t i, opr_sz = simd_oprsz(desc);                      \
669     for (i = 0; i < opr_sz; ) {                                 \
670         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
671         do {                                                    \
672             TYPE n0 = *(TYPE *)(vn + H(i));                     \
673             TYPE m0 = *(TYPE *)(vm + H(i));                     \
674             TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
675             TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
676             if (pg & 1) {                                       \
677                 *(TYPE *)(vd + H(i)) = OP(n0, n1);              \
678             }                                                   \
679             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
680             if (pg & 1) {                                       \
681                 *(TYPE *)(vd + H(i)) = OP(m0, m1);              \
682             }                                                   \
683             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
684         } while (i & 15);                                       \
685     }                                                           \
686 }
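/*
 * For example, with 32-bit elements and an all-true predicate,
 * sve2_addp_zpzz_s produces d[0] = n[0] + n[1], d[1] = m[0] + m[1],
 * d[2] = n[2] + n[3], d[3] = m[2] + m[3], and so on.
 */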
687 
688 /* Similarly, specialized for 64-bit operands.  */
689 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
690 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
691 {                                                               \
692     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
693     TYPE *d = vd, *n = vn, *m = vm;                             \
694     uint8_t *pg = vg;                                           \
695     for (i = 0; i < opr_sz; i += 2) {                           \
696         TYPE n0 = n[i], n1 = n[i + 1];                          \
697         TYPE m0 = m[i], m1 = m[i + 1];                          \
698         if (pg[H1(i)] & 1) {                                    \
699             d[i] = OP(n0, n1);                                  \
700         }                                                       \
701         if (pg[H1(i + 1)] & 1) {                                \
702             d[i + 1] = OP(m0, m1);                              \
703         }                                                       \
704     }                                                           \
705 }
706 
707 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
708 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
709 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
710 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
711 
712 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
713 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
714 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
715 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
716 
717 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
718 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
719 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
720 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
721 
722 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
723 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
724 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
725 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
726 
727 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
728 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
729 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
730 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
731 
732 #undef DO_ZPZZ_PAIR
733 #undef DO_ZPZZ_PAIR_D
734 
735 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                              \
736 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
737                   float_status *status, uint32_t desc)                  \
738 {                                                                       \
739     intptr_t i, opr_sz = simd_oprsz(desc);                              \
740     for (i = 0; i < opr_sz; ) {                                         \
741         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
742         do {                                                            \
743             TYPE n0 = *(TYPE *)(vn + H(i));                             \
744             TYPE m0 = *(TYPE *)(vm + H(i));                             \
745             TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));              \
746             TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));              \
747             if (pg & 1) {                                               \
748                 *(TYPE *)(vd + H(i)) = OP(n0, n1, status);              \
749             }                                                           \
750             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
751             if (pg & 1) {                                               \
752                 *(TYPE *)(vd + H(i)) = OP(m0, m1, status);              \
753             }                                                           \
754             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
755         } while (i & 15);                                               \
756     }                                                                   \
757 }
758 
759 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
760 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
761 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
762 
763 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
764 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
765 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
766 
767 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
768 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
769 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
770 
771 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
772 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
773 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
774 
775 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
776 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
777 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
778 
779 #undef DO_ZPZZ_PAIR_FP
780 
781 /* Three-operand expander, controlled by a predicate, in which the
782  * third operand is "wide".  That is, for D = N op M, the same 64-bit
783  * value of M is used with all of the narrower values of N.
784  */
785 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
786 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
787 {                                                                       \
788     intptr_t i, opr_sz = simd_oprsz(desc);                              \
789     for (i = 0; i < opr_sz; ) {                                         \
790         uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
791         TYPEW mm = *(TYPEW *)(vm + i);                                  \
792         do {                                                            \
793             if (pg & 1) {                                               \
794                 TYPE nn = *(TYPE *)(vn + H(i));                         \
795                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
796             }                                                           \
797             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
798         } while (i & 7);                                                \
799     }                                                                   \
800 }
801 
802 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
803 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
804 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
805 
806 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
807 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
808 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
809 
810 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
811 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
812 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
813 
814 #undef DO_ZPZW
815 
816 /* Fully general two-operand expander, controlled by a predicate.
817  */
818 #define DO_ZPZ(NAME, TYPE, H, OP)                               \
819 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
820 {                                                               \
821     intptr_t i, opr_sz = simd_oprsz(desc);                      \
822     for (i = 0; i < opr_sz; ) {                                 \
823         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
824         do {                                                    \
825             if (pg & 1) {                                       \
826                 TYPE nn = *(TYPE *)(vn + H(i));                 \
827                 *(TYPE *)(vd + H(i)) = OP(nn);                  \
828             }                                                   \
829             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
830         } while (i & 15);                                       \
831     }                                                           \
832 }
833 
834 /* Similarly, specialized for 64-bit operands.  */
835 #define DO_ZPZ_D(NAME, TYPE, OP)                                \
836 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
837 {                                                               \
838     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
839     TYPE *d = vd, *n = vn;                                      \
840     uint8_t *pg = vg;                                           \
841     for (i = 0; i < opr_sz; i += 1) {                           \
842         if (pg[H1(i)] & 1) {                                    \
843             TYPE nn = n[i];                                     \
844             d[i] = OP(nn);                                      \
845         }                                                       \
846     }                                                           \
847 }
848 
849 #define DO_CLS_B(N)   (clrsb32(N) - 24)
850 #define DO_CLS_H(N)   (clrsb32(N) - 16)
851 
852 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
853 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
854 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
855 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
856 
857 #define DO_CLZ_B(N)   (clz32(N) - 24)
858 #define DO_CLZ_H(N)   (clz32(N) - 16)
859 
860 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
861 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
862 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
863 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
864 
865 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
866 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
867 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
868 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
869 
870 #define DO_CNOT(N)    (N == 0)
871 
872 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
873 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
874 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
875 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
876 
877 #define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))
878 
879 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
880 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
881 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
882 
883 #define DO_AH_FABS_H(N) (float16_is_any_nan(N) ? (N) : DO_FABS(N))
884 #define DO_AH_FABS_S(N) (float32_is_any_nan(N) ? (N) : DO_FABS(N))
885 #define DO_AH_FABS_D(N) (float64_is_any_nan(N) ? (N) : DO_FABS(N))
886 
887 DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H)
888 DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S)
889 DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D)
890 
891 #define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))
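/*
 * Both DO_FABS and DO_FNEG are pure bit operations on the storage:
 * the former clears the sign bit, the latter toggles it.  E.g. for the
 * float32 pattern 0xc0400000 (-3.0), DO_FABS gives 0x40400000 (3.0)
 * and DO_FNEG maps either value to the other.
 */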
892 
893 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
894 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
895 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
896 
897 #define DO_AH_FNEG_H(N) (float16_is_any_nan(N) ? (N) : DO_FNEG(N))
898 #define DO_AH_FNEG_S(N) (float32_is_any_nan(N) ? (N) : DO_FNEG(N))
899 #define DO_AH_FNEG_D(N) (float64_is_any_nan(N) ? (N) : DO_FNEG(N))
900 
901 DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H)
902 DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S)
903 DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D)
904 
905 #define DO_NOT(N)    (~N)
906 
907 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
908 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
909 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
910 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
911 
912 #define DO_SXTB(N)    ((int8_t)N)
913 #define DO_SXTH(N)    ((int16_t)N)
914 #define DO_SXTS(N)    ((int32_t)N)
915 #define DO_UXTB(N)    ((uint8_t)N)
916 #define DO_UXTH(N)    ((uint16_t)N)
917 #define DO_UXTS(N)    ((uint32_t)N)
918 
919 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
920 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
921 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
922 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
923 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
924 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
925 
926 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
927 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
928 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
929 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
930 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
931 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
932 
933 #define DO_ABS(N)    (N < 0 ? -N : N)
934 
935 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
936 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
937 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
938 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
939 
940 #define DO_NEG(N)    (-N)
941 
942 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
943 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
944 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
945 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
946 
947 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
948 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
949 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
950 
951 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
952 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
953 
954 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
955 
956 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
957 {
958     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
959     uint64_t *d = vd, *n = vn;
960     uint8_t *pg = vg;
961 
962     for (i = 0; i < opr_sz; i += 2) {
963         if (pg[H1(i)] & 1) {
964             uint64_t n0 = n[i + 0];
965             uint64_t n1 = n[i + 1];
966             d[i + 0] = n1;
967             d[i + 1] = n0;
968         }
969     }
970 }
971 
972 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
973 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
974 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
975 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
976 
977 #define DO_SQABS(X) \
978     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
979        x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
980 
981 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
982 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
983 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
984 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
985 
986 #define DO_SQNEG(X) \
987     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
988        x_ == min_ ? -min_ - 1 : -x_; })
989 
990 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
991 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
992 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
993 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
994 
995 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
996 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
997 
998 /* Three-operand expander, unpredicated, in which the third operand is "wide".
999  */
1000 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
1001 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1002 {                                                              \
1003     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1004     for (i = 0; i < opr_sz; ) {                                \
1005         TYPEW mm = *(TYPEW *)(vm + i);                         \
1006         do {                                                   \
1007             TYPE nn = *(TYPE *)(vn + H(i));                    \
1008             *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
1009             i += sizeof(TYPE);                                 \
1010         } while (i & 7);                                       \
1011     }                                                          \
1012 }
1013 
1014 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1015 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1016 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1017 
1018 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1019 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1020 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1021 
1022 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1023 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1024 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1025 
1026 #undef DO_ZZW
1027 
1028 #undef DO_CLS_B
1029 #undef DO_CLS_H
1030 #undef DO_CLZ_B
1031 #undef DO_CLZ_H
1032 #undef DO_CNOT
1033 #undef DO_FABS
1034 #undef DO_FNEG
1035 #undef DO_ABS
1036 #undef DO_NEG
1037 #undef DO_ZPZ
1038 #undef DO_ZPZ_D
1039 
1040 /*
1041  * Three-operand expander, unpredicated, in which the two inputs are
1042  * selected from the top or bottom half of the wide column.
1043  */
1044 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1045 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1046 {                                                                       \
1047     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1048     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1049     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1050     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1051         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1052         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1053         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
1054     }                                                                   \
1055 }
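/*
 * The two desc bits select the even-numbered ("bottom") or odd-numbered
 * ("top") narrow element of each wide column, independently for VN and
 * VM.  For example, with 16-bit wide and 8-bit narrow elements, sel1 = 0
 * selects the even byte of each halfword and sel1 = 1 the odd byte.
 */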
1056 
1057 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1058 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1059 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1060 
1061 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1062 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1063 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1064 
1065 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1066 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1067 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1068 
1069 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1070 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1071 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1072 
1073 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1074 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1075 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1076 
1077 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1078 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1079 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1080 
1081 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1082 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1083 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1084 
1085 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1086 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1087 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1088 
1089 /* Note that the multiply cannot overflow, but the doubling can. */
1090 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1091 {
1092     int16_t val = n * m;
1093     return DO_SQADD_H(val, val);
1094 }
1095 
1096 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1097 {
1098     int32_t val = n * m;
1099     return DO_SQADD_S(val, val);
1100 }
1101 
1102 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1103 {
1104     int64_t val = n * m;
1105     return do_sqadd_d(val, val);
1106 }
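/*
 * For example, do_sqdmull_h is only applied to values widened from
 * int8_t: n = m = -128 multiplies to 16384, and the saturating doubling
 * 16384 + 16384 overflows int16_t, so the result is INT16_MAX.
 */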
1107 
1108 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1109 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1110 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1111 
1112 #undef DO_ZZZ_TB
1113 
1114 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1115 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1116 {                                                              \
1117     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1118     int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1119     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1120         TYPEW nn = *(TYPEW *)(vn + HW(i));                     \
1121         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));              \
1122         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                   \
1123     }                                                          \
1124 }
1125 
1126 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1127 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1128 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1129 
1130 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1131 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1132 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1133 
1134 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1135 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1136 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1137 
1138 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1139 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1140 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1141 
1142 #undef DO_ZZZ_WTB
1143 
1144 #define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
1145 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1146 {                                                                       \
1147     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1148     intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1149     intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1150     for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
1151         TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
1152         TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
1153         *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
1154     }                                                                   \
1155 }
1156 
1157 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1158 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1159 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1160 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1161 
1162 #undef DO_ZZZ_NTB
1163 
1164 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1165 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1166 {                                                               \
1167     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1168     intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);            \
1169     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
1170         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
1171         TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));               \
1172         TYPEW aa = *(TYPEW *)(va + HW(i));                      \
1173         *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;               \
1174     }                                                           \
1175 }
1176 
1177 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1178 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1179 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1180 
1181 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1182 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1183 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1184 
1185 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1186 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1187 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1188 
1189 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1190 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1191 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1192 
1193 #define DO_NMUL(N, M)  -(N * M)
1194 
1195 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1196 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1197 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1198 
1199 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1200 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1201 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1202 
1203 #undef DO_ZZZW_ACC
1204 
1205 #define DO_XTNB(NAME, TYPE, OP) \
1206 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
1207 {                                                            \
1208     intptr_t i, opr_sz = simd_oprsz(desc);                   \
1209     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
1210         TYPE nn = *(TYPE *)(vn + i);                         \
1211         nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
1212         *(TYPE *)(vd + i) = nn;                              \
1213     }                                                        \
1214 }
1215 
1216 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
1217 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
1218 {                                                                       \
1219     intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));      \
1220     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
1221         TYPE nn = *(TYPE *)(vn + i);                                    \
1222         *(TYPEN *)(vd + i + odd) = OP(nn);                              \
1223     }                                                                   \
1224 }
1225 
1226 #define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
1227 #define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
1228 #define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)
1229 
1230 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1231 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1232 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1233 
1234 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1235 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1236 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1237 
1238 #define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
1239 #define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
1240 #define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)
1241 
1242 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1243 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1244 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1245 
1246 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1247 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1248 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1249 
1250 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1251 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1252 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1253 
1254 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1255 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1256 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1257 
1258 #undef DO_XTNB
1259 #undef DO_XTNT
1260 
1261 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1262 {
1263     intptr_t i, opr_sz = simd_oprsz(desc);
1264     int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1265     uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1266     uint32_t *a = va, *n = vn;
1267     uint64_t *d = vd, *m = vm;
1268 
1269     for (i = 0; i < opr_sz / 8; ++i) {
1270         uint32_t e1 = a[2 * i + H4(0)];
1271         uint32_t e2 = n[2 * i + sel] ^ inv;
1272         uint64_t c = extract64(m[i], 32, 1);
1273         /* Compute and store the entire 33-bit result at once. */
1274         d[i] = c + e1 + e2;
1275     }
1276 }
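/*
 * Illustration: each 64-bit element of VM supplies the incoming carry
 * in bit 32.  With e1 = 0xffffffff, e2 = 1 and c = 0, the 64-bit sum is
 * 0x100000000: the low half (the 32-bit result element) is 0 and bit 32
 * (the carry out, read by a subsequent ADCLB/ADCLT) is set.
 */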
1277 
1278 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1279 {
1280     intptr_t i, opr_sz = simd_oprsz(desc);
1281     int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1282     uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1283     uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1284 
1285     for (i = 0; i < opr_sz / 8; i += 2) {
1286         Int128 e1 = int128_make64(a[i]);
1287         Int128 e2 = int128_make64(n[i + sel] ^ inv);
1288         Int128 c = int128_make64(m[i + 1] & 1);
1289         Int128 r = int128_add(int128_add(e1, e2), c);
1290         d[i + 0] = int128_getlo(r);
1291         d[i + 1] = int128_gethi(r);
1292     }
1293 }
1294 
1295 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1296 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1297 {                                                                       \
1298     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1299     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1300     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1301     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1302         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1303         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1304         TYPEW aa = *(TYPEW *)(va + HW(i));                              \
1305         *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
1306     }                                                                   \
1307 }
1308 
1309 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1310            do_sqdmull_h, DO_SQADD_H)
1311 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1312            do_sqdmull_s, DO_SQADD_S)
1313 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1314            do_sqdmull_d, do_sqadd_d)
1315 
1316 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1317            do_sqdmull_h, DO_SQSUB_H)
1318 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1319            do_sqdmull_s, DO_SQSUB_S)
1320 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1321            do_sqdmull_d, do_sqsub_d)
1322 
1323 #undef DO_SQDMLAL
1324 
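/*
 * Complex integer multiply-add.  The descriptor data encodes the
 * rotation in multiples of 90 degrees: bit 0 selects which element of
 * each N pair is the common multiplicand (it multiplies both elements
 * of the corresponding M pair), and sub_r / sub_i say whether the
 * product is subtracted from the real / imaginary accumulator.
 */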
1325 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1326 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1327 {                                                               \
1328     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
1329     int rot = simd_data(desc);                                  \
1330     int sel_a = rot & 1, sel_b = sel_a ^ 1;                     \
1331     bool sub_r = rot == 1 || rot == 2;                          \
1332     bool sub_i = rot >= 2;                                      \
1333     TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
1334     for (i = 0; i < opr_sz; i += 2) {                           \
1335         TYPE elt1_a = n[H(i + sel_a)];                          \
1336         TYPE elt2_a = m[H(i + sel_a)];                          \
1337         TYPE elt2_b = m[H(i + sel_b)];                          \
1338         d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);           \
1339         d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);   \
1340     }                                                           \
1341 }
1342 
1343 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1344 
1345 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1346 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1347 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1348 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1349 
1350 #define DO_SQRDMLAH_B(N, M, A, S) \
1351     do_sqrdmlah_b(N, M, A, S, true)
1352 #define DO_SQRDMLAH_H(N, M, A, S) \
1353     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1354 #define DO_SQRDMLAH_S(N, M, A, S) \
1355     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1356 #define DO_SQRDMLAH_D(N, M, A, S) \
1357     do_sqrdmlah_d(N, M, A, S, true)
1358 
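/* As above, but saturating rounding doubling multiply-add (SQRDCMLAH);
 * the per-element saturation indication is discarded.
 */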
1359 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1360 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1361 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1362 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1363 
1364 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1365 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
1366 {                                                                           \
1367     intptr_t i, j, oprsz = simd_oprsz(desc);                                \
1368     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
1369     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
1370     int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
1371     bool sub_r = rot == 1 || rot == 2;                                      \
1372     bool sub_i = rot >= 2;                                                  \
1373     TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
1374     for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
1375         TYPE elt2_a = m[H(i + idx + sel_a)];                                \
1376         TYPE elt2_b = m[H(i + idx + sel_b)];                                \
1377         for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
1378             TYPE elt1_a = n[H(i + j + sel_a)];                              \
1379             d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);          \
1380             d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);  \
1381         }                                                                   \
1382     }                                                                       \
1383 }
1384 
1385 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1386 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1387 
1388 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1389 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1390 
1391 #undef DO_CMLA
1392 #undef DO_CMLA_FUNC
1393 #undef DO_CMLA_IDX_FUNC
1394 #undef DO_SQRDMLAH_B
1395 #undef DO_SQRDMLAH_H
1396 #undef DO_SQRDMLAH_S
1397 #undef DO_SQRDMLAH_D
1398 
1399 /* Note N and M each bundle 4 narrow elements (two complex pairs) into one wide unit. */
1400 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1401                          int sel_a, int sel_b, int sub_i)
1402 {
1403     for (int i = 0; i <= 1; i++) {
1404         int32_t elt1_r = (int8_t)(n >> (16 * i));
1405         int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1406         int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1407         int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1408 
1409         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1410     }
1411     return a;
1412 }
1413 
1414 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1415                          int sel_a, int sel_b, int sub_i)
1416 {
1417     for (int i = 0; i <= 1; i++) {
1418         int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1419         int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1420         int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1421         int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1422 
1423         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1424     }
1425     return a;
1426 }
1427 
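/*
 * Complex integer dot product (CDOT).  The rotation selects which
 * element of each M pair multiplies the real and imaginary parts of N,
 * and sub_i gives the sign applied to the imaginary product.
 */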
1428 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1429                               void *va, uint32_t desc)
1430 {
1431     int opr_sz = simd_oprsz(desc);
1432     int rot = simd_data(desc);
1433     int sel_a = rot & 1;
1434     int sel_b = sel_a ^ 1;
1435     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1436     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1437 
1438     for (int e = 0; e < opr_sz / 4; e++) {
1439         d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1440     }
1441 }
1442 
1443 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1444                               void *va, uint32_t desc)
1445 {
1446     int opr_sz = simd_oprsz(desc);
1447     int rot = simd_data(desc);
1448     int sel_a = rot & 1;
1449     int sel_b = sel_a ^ 1;
1450     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1451     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1452 
1453     for (int e = 0; e < opr_sz / 8; e++) {
1454         d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1455     }
1456 }
1457 
1458 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1459                              void *va, uint32_t desc)
1460 {
1461     int opr_sz = simd_oprsz(desc);
1462     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1463     int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1464     int sel_a = rot & 1;
1465     int sel_b = sel_a ^ 1;
1466     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1467     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1468 
1469     for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1470         uint32_t seg_m = m[seg + idx];
1471         for (int e = 0; e < 4; e++) {
1472             d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1473                                    sel_a, sel_b, sub_i);
1474         }
1475     }
1476 }
1477 
1478 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1479                              void *va, uint32_t desc)
1480 {
1481     int seg, opr_sz = simd_oprsz(desc);
1482     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1483     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1484     int sel_a = rot & 1;
1485     int sel_b = sel_a ^ 1;
1486     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1487     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1488 
1489     for (seg = 0; seg < opr_sz / 8; seg += 2) {
1490         uint64_t seg_m = m[seg + idx];
1491         for (int e = 0; e < 2; e++) {
1492             d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1493                                    sel_a, sel_b, sub_i);
1494         }
1495     }
1496 }
1497 
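/*
 * Three-operand expander with an indexed second operand: within each
 * 128-bit segment, the single element of M selected by the index is
 * applied to every element of N in that segment.
 */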
1498 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1499 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1500 {                                                                       \
1501     intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
1502     intptr_t i, j, idx = simd_data(desc);                               \
1503     TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
1504     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
1505         TYPE mm = m[i];                                                 \
1506         for (j = 0; j < segment; j++) {                                 \
1507             d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
1508         }                                                               \
1509     }                                                                   \
1510 }
1511 
1512 #define DO_SQRDMLAH_H(N, M, A) \
1513     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1514 #define DO_SQRDMLAH_S(N, M, A) \
1515     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1516 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1517 
1518 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1519 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1520 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1521 
1522 #define DO_SQRDMLSH_H(N, M, A) \
1523     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1524 #define DO_SQRDMLSH_S(N, M, A) \
1525     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1526 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1527 
1528 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1529 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1530 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1531 
1532 #undef DO_ZZXZ
1533 
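/*
 * Widening three-operand expander with an indexed narrow operand:
 * SEL picks the bottom or top narrow elements of N, and IDX selects
 * the one narrow element of M reused across each 128-bit segment.
 */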
1534 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1535 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
1536 {                                                                         \
1537     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1538     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1539     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1540     for (i = 0; i < oprsz; i += 16) {                                     \
1541         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1542         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1543             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1544             TYPEW aa = *(TYPEW *)(va + HW(i + j));                        \
1545             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                  \
1546         }                                                                 \
1547     }                                                                     \
1548 }
1549 
1550 #define DO_MLA(N, M, A)  (A + N * M)
1551 
1552 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1553 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1554 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1555 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1556 
1557 #define DO_MLS(N, M, A)  (A - N * M)
1558 
1559 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1560 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1561 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1562 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1563 
1564 #define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
1565 #define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))
1566 
1567 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1568 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1569 
1570 #define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
1571 #define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))
1572 
1573 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1574 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1575 
1576 #undef DO_MLA
1577 #undef DO_MLS
1578 #undef DO_ZZXW
1579 
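/* As above, but without an accumulator: widening multiply of the
 * selected N elements by an indexed narrow element of M.
 */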
1580 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1581 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
1582 {                                                                         \
1583     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1584     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1585     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1586     for (i = 0; i < oprsz; i += 16) {                                     \
1587         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1588         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1589             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1590             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                      \
1591         }                                                                 \
1592     }                                                                     \
1593 }
1594 
1595 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1596 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1597 
1598 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1599 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1600 
1601 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1602 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1603 
1604 #undef DO_ZZX
1605 
1606 #define DO_BITPERM(NAME, TYPE, OP) \
1607 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1608 {                                                              \
1609     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1610     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {               \
1611         TYPE nn = *(TYPE *)(vn + i);                           \
1612         TYPE mm = *(TYPE *)(vm + i);                           \
1613         *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);      \
1614     }                                                          \
1615 }
1616 
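/* Gather the DATA bits selected by MASK into a contiguous field at the
 * bottom of the result (BEXT).
 */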
1617 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1618 {
1619     uint64_t res = 0;
1620     int db, rb = 0;
1621 
1622     for (db = 0; db < n; ++db) {
1623         if ((mask >> db) & 1) {
1624             res |= ((data >> db) & 1) << rb;
1625             ++rb;
1626         }
1627     }
1628     return res;
1629 }
1630 
1631 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1632 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1633 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1634 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1635 
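/* Scatter the low-order DATA bits into the result bit positions
 * selected by MASK (BDEP).
 */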
1636 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1637 {
1638     uint64_t res = 0;
1639     int rb, db = 0;
1640 
1641     for (rb = 0; rb < n; ++rb) {
1642         if ((mask >> rb) & 1) {
1643             res |= ((data >> db) & 1) << rb;
1644             ++db;
1645         }
1646     }
1647     return res;
1648 }
1649 
1650 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1651 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1652 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1653 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1654 
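/* Group the DATA bits selected by MASK at the bottom of the result,
 * with the unselected bits packed above them (BGRP).
 */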
1655 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1656 {
1657     uint64_t resm = 0, resu = 0;
1658     int db, rbm = 0, rbu = 0;
1659 
1660     for (db = 0; db < n; ++db) {
1661         uint64_t val = (data >> db) & 1;
1662         if ((mask >> db) & 1) {
1663             resm |= val << rbm++;
1664         } else {
1665             resu |= val << rbu++;
1666         }
1667     }
1668 
1669     return resm | (resu << rbm);
1670 }
1671 
1672 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1673 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1674 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1675 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1676 
1677 #undef DO_BITPERM
1678 
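/*
 * Complex integer add with rotation.  Depending on the rotation bit in
 * the descriptor, the imaginary element of M is added to or subtracted
 * from the real accumulator, with the real element of M correspondingly
 * subtracted from or added to the imaginary accumulator.
 */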
1679 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
1680 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
1681 {                                                               \
1682     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1683     int sub_r = simd_data(desc);                                \
1684     if (sub_r) {                                                \
1685         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1686             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1687             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1688             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1689             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1690             acc_r = ADD_OP(acc_r, el2_i);                       \
1691             acc_i = SUB_OP(acc_i, el2_r);                       \
1692             *(TYPE *)(vd + H(i)) = acc_r;                       \
1693             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1694         }                                                       \
1695     } else {                                                    \
1696         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1697             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1698             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1699             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1700             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1701             acc_r = SUB_OP(acc_r, el2_i);                       \
1702             acc_i = ADD_OP(acc_i, el2_r);                       \
1703             *(TYPE *)(vd + H(i)) = acc_r;                       \
1704             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1705         }                                                       \
1706     }                                                           \
1707 }
1708 
1709 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1710 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1711 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1712 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1713 
1714 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1715 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1716 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1717 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1718 
1719 #undef DO_CADD
1720 
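/* Widening shift left by immediate: bit 0 of the descriptor data selects
 * the bottom or top narrow elements, the remaining bits give the shift.
 */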
1721 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1722 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1723 {                                                              \
1724     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1725     intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);      \
1726     int shift = simd_data(desc) >> 1;                          \
1727     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1728         TYPEW nn = *(TYPEN *)(vn + HN(i + sel));               \
1729         *(TYPEW *)(vd + HW(i)) = nn << shift;                  \
1730     }                                                          \
1731 }
1732 
1733 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1734 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1735 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1736 
1737 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1738 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1739 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1740 
1741 #undef DO_ZZI_SHLL
1742 
1743 /* Two-operand reduction expander, controlled by a predicate.
1744  * The difference between TYPERED and TYPERET is sign-extension:
1745  * for SMAX, TYPERED must be signed so the reduction compares
1746  * signed values, but TYPERET must be unsigned so that a 32-bit
1747  * result is not sign-extended into the ABI uint64_t return type.
1748  */
1749 /* ??? If we were to vectorize this by hand the reduction ordering
1750  * would change.  For integer operands, this is perfectly fine.
1751  */
1752 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1753 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1754 {                                                          \
1755     intptr_t i, opr_sz = simd_oprsz(desc);                 \
1756     TYPERED ret = INIT;                                    \
1757     for (i = 0; i < opr_sz; ) {                            \
1758         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
1759         do {                                               \
1760             if (pg & 1) {                                  \
1761                 TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
1762                 ret = OP(ret, nn);                         \
1763             }                                              \
1764             i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
1765         } while (i & 15);                                  \
1766     }                                                      \
1767     return (TYPERET)ret;                                   \
1768 }
1769 
1770 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
1771 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1772 {                                                          \
1773     intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
1774     TYPEE *n = vn;                                         \
1775     uint8_t *pg = vg;                                      \
1776     TYPER ret = INIT;                                      \
1777     for (i = 0; i < opr_sz; i += 1) {                      \
1778         if (pg[H1(i)] & 1) {                               \
1779             TYPEE nn = n[i];                               \
1780             ret = OP(ret, nn);                             \
1781         }                                                  \
1782     }                                                      \
1783     return ret;                                            \
1784 }
1785 
1786 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1787 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1788 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1789 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1790 
1791 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1792 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1793 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1794 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1795 
1796 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1797 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1798 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1799 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1800 
1801 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1802 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1803 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1804 
1805 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1806 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1807 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1808 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1809 
1810 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1811 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1812 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1813 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1814 
1815 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1816 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1817 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1818 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1819 
1820 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1821 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1822 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1823 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1824 
1825 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1826 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1827 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1828 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1829 
1830 #undef DO_VPZ
1831 #undef DO_VPZ_D
1832 
1833 /* Two vector operand, one scalar operand, unpredicated.  */
1834 #define DO_ZZI(NAME, TYPE, OP)                                       \
1835 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
1836 {                                                                    \
1837     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
1838     TYPE s = s64, *d = vd, *n = vn;                                  \
1839     for (i = 0; i < opr_sz; ++i) {                                   \
1840         d[i] = OP(n[i], s);                                          \
1841     }                                                                \
1842 }
1843 
1844 #define DO_SUBR(X, Y)   (Y - X)
1845 
1846 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1847 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1848 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1849 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1850 
1851 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1852 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1853 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1854 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1855 
1856 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1857 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1858 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1859 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1860 
1861 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1862 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1863 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1864 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1865 
1866 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1867 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1868 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1869 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1870 
1871 #undef DO_ZZI
1872 
1873 #undef DO_AND
1874 #undef DO_ORR
1875 #undef DO_EOR
1876 #undef DO_BIC
1877 #undef DO_ADD
1878 #undef DO_SUB
1879 #undef DO_MAX
1880 #undef DO_MIN
1881 #undef DO_ABD
1882 #undef DO_MUL
1883 #undef DO_DIV
1884 #undef DO_ASR
1885 #undef DO_LSR
1886 #undef DO_LSL
1887 #undef DO_SUBR
1888 
1889 /* Similar to the ARM LastActiveElement pseudocode function, except the
1890    result is multiplied by the element size.  This includes the not found
1891    indication; e.g. not found for esz=3 is -8.  */
1892 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1893 {
1894     uint64_t mask = pred_esz_masks[esz];
1895     intptr_t i = words;
1896 
1897     do {
1898         uint64_t this_g = g[--i] & mask;
1899         if (this_g) {
1900             return i * 64 + (63 - clz64(this_g));
1901         }
1902     } while (i > 0);
1903     return (intptr_t)-1 << esz;
1904 }
1905 
1906 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1907 {
1908     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1909     uint32_t flags = PREDTEST_INIT;
1910     uint64_t *d = vd, *g = vg;
1911     intptr_t i = 0;
1912 
1913     do {
1914         uint64_t this_d = d[i];
1915         uint64_t this_g = g[i];
1916 
1917         if (this_g) {
1918             if (!(flags & 4)) {
1919                 /* Set in D the first bit of G.  */
1920                 this_d |= this_g & -this_g;
1921                 d[i] = this_d;
1922             }
1923             flags = iter_predtest_fwd(this_d, this_g, flags);
1924         }
1925     } while (++i < words);
1926 
1927     return flags;
1928 }
1929 
1930 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1931 {
1932     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1933     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1934     uint32_t flags = PREDTEST_INIT;
1935     uint64_t *d = vd, *g = vg, esz_mask;
1936     intptr_t i, next;
1937 
1938     next = last_active_element(vd, words, esz) + (1 << esz);
1939     esz_mask = pred_esz_masks[esz];
1940 
1941     /* Similar to the pseudocode for pnext, but scaled by ESZ
1942        so that we find the correct bit.  */
1943     if (next < words * 64) {
1944         uint64_t mask = -1;
1945 
1946         if (next & 63) {
1947             mask = ~((1ull << (next & 63)) - 1);
1948             next &= -64;
1949         }
1950         do {
1951             uint64_t this_g = g[next / 64] & esz_mask & mask;
1952             if (this_g != 0) {
1953                 next = (next & -64) + ctz64(this_g);
1954                 break;
1955             }
1956             next += 64;
1957             mask = -1;
1958         } while (next < words * 64);
1959     }
1960 
1961     i = 0;
1962     do {
1963         uint64_t this_d = 0;
1964         if (i == next / 64) {
1965             this_d = 1ull << (next & 63);
1966         }
1967         d[i] = this_d;
1968         flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1969     } while (++i < words);
1970 
1971     return flags;
1972 }
1973 
1974 /*
1975  * Copy Zn into Zd, and store zero into inactive elements.
1976  * If inv, store zeros into the active elements.
1977  */
1978 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1979 {
1980     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1981     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1982     uint64_t *d = vd, *n = vn;
1983     uint8_t *pg = vg;
1984 
1985     for (i = 0; i < opr_sz; i += 1) {
1986         d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1987     }
1988 }
1989 
1990 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1991 {
1992     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1993     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1994     uint64_t *d = vd, *n = vn;
1995     uint8_t *pg = vg;
1996 
1997     for (i = 0; i < opr_sz; i += 1) {
1998         d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1999     }
2000 }
2001 
2002 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
2003 {
2004     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2005     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2006     uint64_t *d = vd, *n = vn;
2007     uint8_t *pg = vg;
2008 
2009     for (i = 0; i < opr_sz; i += 1) {
2010         d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
2011     }
2012 }
2013 
2014 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
2015 {
2016     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2017     uint64_t *d = vd, *n = vn;
2018     uint8_t *pg = vg;
2019     uint8_t inv = simd_data(desc);
2020 
2021     for (i = 0; i < opr_sz; i += 1) {
2022         d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2023     }
2024 }
2025 
2026 /* Three-operand expander, immediate operand, controlled by a predicate.
2027  */
2028 #define DO_ZPZI(NAME, TYPE, H, OP)                              \
2029 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2030 {                                                               \
2031     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2032     TYPE imm = simd_data(desc);                                 \
2033     for (i = 0; i < opr_sz; ) {                                 \
2034         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
2035         do {                                                    \
2036             if (pg & 1) {                                       \
2037                 TYPE nn = *(TYPE *)(vn + H(i));                 \
2038                 *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
2039             }                                                   \
2040             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
2041         } while (i & 15);                                       \
2042     }                                                           \
2043 }
2044 
2045 /* Similarly, specialized for 64-bit operands.  */
2046 #define DO_ZPZI_D(NAME, TYPE, OP)                               \
2047 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2048 {                                                               \
2049     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
2050     TYPE *d = vd, *n = vn;                                      \
2051     TYPE imm = simd_data(desc);                                 \
2052     uint8_t *pg = vg;                                           \
2053     for (i = 0; i < opr_sz; i += 1) {                           \
2054         if (pg[H1(i)] & 1) {                                    \
2055             TYPE nn = n[i];                                     \
2056             d[i] = OP(nn, imm);                                 \
2057         }                                                       \
2058     }                                                           \
2059 }
2060 
2061 #define DO_SHR(N, M)  (N >> M)
2062 #define DO_SHL(N, M)  (N << M)
2063 
2064 /* Arithmetic shift right for division.  This rounds negative numbers
2065    toward zero as per signed division.  Therefore before shifting,
2066    when N is negative, add 2**M-1.  */
2067 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
2068 
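/* Rounding shift right: add back the last bit shifted out, i.e. round
 * to nearest with halfway cases rounded up.
 */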
2069 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2070 {
2071     if (likely(sh < 64)) {
2072         return (x >> sh) + ((x >> (sh - 1)) & 1);
2073     } else if (sh == 64) {
2074         return x >> 63;
2075     } else {
2076         return 0;
2077     }
2078 }
2079 
2080 static inline int64_t do_srshr(int64_t x, unsigned sh)
2081 {
2082     if (likely(sh < 64)) {
2083         return (x >> sh) + ((x >> (sh - 1)) & 1);
2084     } else {
2085         /* Rounding the sign bit always produces 0. */
2086         return 0;
2087     }
2088 }
2089 
2090 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2091 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2092 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2093 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2094 
2095 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2096 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2097 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2098 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2099 
2100 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2101 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2102 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2103 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2104 
2105 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2106 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2107 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2108 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2109 
2110 /* SVE2 bitwise shift by immediate */
2111 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2112 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2113 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2114 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2115 
2116 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2117 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2118 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2119 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2120 
2121 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2122 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2123 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2124 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2125 
2126 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2127 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2128 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2129 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2130 
2131 #define do_suqrshl_b(n, m) \
2132    ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2133 #define do_suqrshl_h(n, m) \
2134    ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2135 #define do_suqrshl_s(n, m) \
2136    ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2137 #define do_suqrshl_d(n, m) \
2138    ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2139 
2140 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2141 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2142 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2143 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2144 
2145 #undef DO_ASRD
2146 #undef DO_ZPZI
2147 #undef DO_ZPZI_D
2148 
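/*
 * Narrowing shifts.  The *B (bottom) forms store the zero-extended
 * narrow result as a whole wide element, clearing the odd half; the
 * *T (top) forms store only the narrow result into the odd half,
 * leaving the even elements of the destination unchanged.
 */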
2149 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2150 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
2151 {                                                            \
2152     intptr_t i, opr_sz = simd_oprsz(desc);                   \
2153     int shift = simd_data(desc);                             \
2154     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
2155         TYPEW nn = *(TYPEW *)(vn + i);                       \
2156         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
2157     }                                                        \
2158 }
2159 
2160 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
2161 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
2162 {                                                                 \
2163     intptr_t i, opr_sz = simd_oprsz(desc);                        \
2164     int shift = simd_data(desc);                                  \
2165     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
2166         TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
2167         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
2168     }                                                             \
2169 }
2170 
2171 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2172 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2173 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2174 
2175 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2176 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2177 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2178 
2179 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2180 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2181 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2182 
2183 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2184 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2185 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2186 
2187 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2188 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2189 #define DO_SQSHRUN_D(x, sh) \
2190     do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2191 
2192 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2193 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2194 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2195 
2196 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2197 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2198 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2199 
2200 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2201 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2202 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2203 
2204 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2205 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2206 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2207 
2208 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2209 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2210 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2211 
2212 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2213 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2214 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2215 
2216 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2217 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2218 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2219 
2220 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2221 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2222 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2223 
2224 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2225 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2226 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2227 
2228 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2229 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2230 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2231 
2232 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2233 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2234 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2235 
2236 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2237 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2238 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2239 
2240 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2241 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2242 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2243 
2244 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2245 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2246 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2247 
2248 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2249 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2250 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2251 
2252 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2253 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2254 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2255 
2256 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2257 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2258 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2259 
2260 #undef DO_SHRNB
2261 #undef DO_SHRNT
2262 
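/*
 * Narrowing binary operations: the wide sum or difference is shifted
 * down so that only its high half remains, with optional rounding
 * (the R forms).  Bottom and top element placement is as for the
 * narrowing shifts above.
 */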
2263 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                           \
2264 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2265 {                                                                           \
2266     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2267     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2268         TYPEW nn = *(TYPEW *)(vn + i);                                      \
2269         TYPEW mm = *(TYPEW *)(vm + i);                                      \
2270         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);                      \
2271     }                                                                       \
2272 }
2273 
2274 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)                   \
2275 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2276 {                                                                           \
2277     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2278     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2279         TYPEW nn = *(TYPEW *)(vn + HW(i));                                  \
2280         TYPEW mm = *(TYPEW *)(vm + HW(i));                                  \
2281         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);         \
2282     }                                                                       \
2283 }
2284 
2285 #define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
2286 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2287 #define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
2288 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
2289 
2290 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2291 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2292 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2293 
2294 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2295 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2296 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2297 
2298 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2299 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2300 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2301 
2302 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2303 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2304 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2305 
2306 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2307 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2308 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2309 
2310 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2311 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2312 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2313 
2314 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2315 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2316 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2317 
2318 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2319 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2320 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2321 
2322 #undef DO_RSUBHN
2323 #undef DO_SUBHN
2324 #undef DO_RADDHN
2325 #undef DO_ADDHN
2326 
2327 #undef DO_BINOPNB
2328 
2329 /* Fully general four-operand expander, controlled by a predicate.
2330  */
2331 #define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
2332 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2333                   void *vg, uint32_t desc)                    \
2334 {                                                             \
2335     intptr_t i, opr_sz = simd_oprsz(desc);                    \
2336     for (i = 0; i < opr_sz; ) {                               \
2337         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
2338         do {                                                  \
2339             if (pg & 1) {                                     \
2340                 TYPE nn = *(TYPE *)(vn + H(i));               \
2341                 TYPE mm = *(TYPE *)(vm + H(i));               \
2342                 TYPE aa = *(TYPE *)(va + H(i));               \
2343                 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
2344             }                                                 \
2345             i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
2346         } while (i & 15);                                     \
2347     }                                                         \
2348 }
2349 
2350 /* Similarly, specialized for 64-bit operands.  */
2351 #define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
2352 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2353                   void *vg, uint32_t desc)                    \
2354 {                                                             \
2355     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
2356     TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
2357     uint8_t *pg = vg;                                         \
2358     for (i = 0; i < opr_sz; i += 1) {                         \
2359         if (pg[H1(i)] & 1) {                                  \
2360             TYPE aa = a[i], nn = n[i], mm = m[i];             \
2361             d[i] = OP(aa, nn, mm);                            \
2362         }                                                     \
2363     }                                                         \
2364 }
2365 
2366 #define DO_MLA(A, N, M)  (A + N * M)
2367 #define DO_MLS(A, N, M)  (A - N * M)
2368 
2369 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2370 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2371 
2372 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2373 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2374 
2375 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2376 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2377 
2378 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2379 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2380 
2381 #undef DO_MLA
2382 #undef DO_MLS
2383 #undef DO_ZPZZZ
2384 #undef DO_ZPZZZ_D
2385 
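/* INDEX (series): d[i] = start + i * incr for each element.  */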
2386 void HELPER(sve_index_b)(void *vd, uint32_t start,
2387                          uint32_t incr, uint32_t desc)
2388 {
2389     intptr_t i, opr_sz = simd_oprsz(desc);
2390     uint8_t *d = vd;
2391     for (i = 0; i < opr_sz; i += 1) {
2392         d[H1(i)] = start + i * incr;
2393     }
2394 }
2395 
2396 void HELPER(sve_index_h)(void *vd, uint32_t start,
2397                          uint32_t incr, uint32_t desc)
2398 {
2399     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2400     uint16_t *d = vd;
2401     for (i = 0; i < opr_sz; i += 1) {
2402         d[H2(i)] = start + i * incr;
2403     }
2404 }
2405 
2406 void HELPER(sve_index_s)(void *vd, uint32_t start,
2407                          uint32_t incr, uint32_t desc)
2408 {
2409     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2410     uint32_t *d = vd;
2411     for (i = 0; i < opr_sz; i += 1) {
2412         d[H4(i)] = start + i * incr;
2413     }
2414 }
2415 
2416 void HELPER(sve_index_d)(void *vd, uint64_t start,
2417                          uint64_t incr, uint32_t desc)
2418 {
2419     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2420     uint64_t *d = vd;
2421     for (i = 0; i < opr_sz; i += 1) {
2422         d[i] = start + i * incr;
2423     }
2424 }
2425 
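/*
 * ADR: vector address generation, base plus shifted index.  The _p32
 * and _p64 forms use same-sized elements; _s32 and _u32 sign- or
 * zero-extend a 32-bit index within each 64-bit element.
 */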
2426 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2427 {
2428     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2429     uint32_t sh = simd_data(desc);
2430     uint32_t *d = vd, *n = vn, *m = vm;
2431     for (i = 0; i < opr_sz; i += 1) {
2432         d[i] = n[i] + (m[i] << sh);
2433     }
2434 }
2435 
2436 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2437 {
2438     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2439     uint64_t sh = simd_data(desc);
2440     uint64_t *d = vd, *n = vn, *m = vm;
2441     for (i = 0; i < opr_sz; i += 1) {
2442         d[i] = n[i] + (m[i] << sh);
2443     }
2444 }
2445 
2446 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2447 {
2448     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2449     uint64_t sh = simd_data(desc);
2450     uint64_t *d = vd, *n = vn, *m = vm;
2451     for (i = 0; i < opr_sz; i += 1) {
2452         d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2453     }
2454 }
2455 
2456 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2457 {
2458     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2459     uint64_t sh = simd_data(desc);
2460     uint64_t *d = vd, *n = vn, *m = vm;
2461     for (i = 0; i < opr_sz; i += 1) {
2462         d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2463     }
2464 }
2465 
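/* FEXPA: the low bits of each element index a table of fraction values
 * and the bits above them supply the exponent of the result.
 */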
2466 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2467 {
2468     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2469     static const uint16_t coeff[] = {
2470         0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2471         0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2472         0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2473         0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2474     };
2475     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2476     uint16_t *d = vd, *n = vn;
2477 
2478     for (i = 0; i < opr_sz; i++) {
2479         uint16_t nn = n[i];
2480         intptr_t idx = extract32(nn, 0, 5);
2481         uint16_t exp = extract32(nn, 5, 5);
2482         d[i] = coeff[idx] | (exp << 10);
2483     }
2484 }
2485 
2486 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2487 {
2488     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2489     static const uint32_t coeff[] = {
2490         0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2491         0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2492         0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2493         0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2494         0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2495         0x1ef532, 0x20b051, 0x227043, 0x243516,
2496         0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2497         0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2498         0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2499         0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2500         0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2501         0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2502         0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2503         0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2504         0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2505         0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2506     };
2507     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2508     uint32_t *d = vd, *n = vn;
2509 
2510     for (i = 0; i < opr_sz; i++) {
2511         uint32_t nn = n[i];
2512         intptr_t idx = extract32(nn, 0, 6);
2513         uint32_t exp = extract32(nn, 6, 8);
2514         d[i] = coeff[idx] | (exp << 23);
2515     }
2516 }
2517 
2518 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2519 {
2520     /* These constants are cut-and-paste directly from the ARM pseudocode.  */
2521     static const uint64_t coeff[] = {
2522         0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2523         0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2524         0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2525         0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2526         0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2527         0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2528         0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2529         0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2530         0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2531         0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2532         0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2533         0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2534         0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2535         0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2536         0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2537         0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2538         0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2539         0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2540         0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2541         0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2542         0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2543         0xFA7C1819E90D8ull,
2544     };
2545     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2546     uint64_t *d = vd, *n = vn;
2547 
2548     for (i = 0; i < opr_sz; i++) {
2549         uint64_t nn = n[i];
2550         intptr_t idx = extract32(nn, 0, 6);
2551         uint64_t exp = extract32(nn, 6, 11);
2552         d[i] = coeff[idx] | (exp << 52);
2553     }
2554 }
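/*
 * In each FEXPA form the low index bits of the input select a fraction
 * from the table and the following bits are placed directly into the
 * exponent field, so the result is a raw IEEE-754 bit pattern.  For
 * example, in the single-precision helper nn == 0x1fe0 gives idx == 32
 * and exp == 127, hence d == 0x3504f3 | (127 << 23) == 0x3fb504f3,
 * which is sqrt(2) ~= 1.41421.
 */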
2555 
2556 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2557 {
2558     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2559     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2560     uint16_t *d = vd, *n = vn, *m = vm;
2561     for (i = 0; i < opr_sz; i += 1) {
2562         uint16_t nn = n[i];
2563         uint16_t mm = m[i];
2564         if (mm & 1) {
2565             nn = float16_one;
2566         }
2567         if (mm & 2) {
2568             nn = float16_maybe_ah_chs(nn, fpcr_ah);
2569         }
2570         d[i] = nn;
2571     }
2572 }
2573 
2574 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2575 {
2576     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2577     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2578     uint32_t *d = vd, *n = vn, *m = vm;
2579     for (i = 0; i < opr_sz; i += 1) {
2580         uint32_t nn = n[i];
2581         uint32_t mm = m[i];
2582         if (mm & 1) {
2583             nn = float32_one;
2584         }
2585         if (mm & 2) {
2586             nn = float32_maybe_ah_chs(nn, fpcr_ah);
2587         }
2588         d[i] = nn;
2589     }
2590 }
2591 
2592 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2593 {
2594     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2595     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2596     uint64_t *d = vd, *n = vn, *m = vm;
2597     for (i = 0; i < opr_sz; i += 1) {
2598         uint64_t nn = n[i];
2599         uint64_t mm = m[i];
2600         if (mm & 1) {
2601             nn = float64_one;
2602         }
2603         if (mm & 2) {
2604             nn = float64_maybe_ah_chs(nn, fpcr_ah);
2605         }
2606         d[i] = nn;
2607     }
2608 }
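/*
 * In the FTSSEL helpers above, bit 0 of the control element replaces the
 * input with the constant 1.0 and bit 1 then negates the result via the
 * FPCR.AH-aware sign-change helper.  This roughly follows the ARM FTSSEL
 * pseudocode, which uses the quadrant number to pre-condition the operand
 * for the FTMAD-based sin/cos expansion.
 */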
2609 
2610 /*
2611  * Signed saturating addition with scalar operand.
2612  */
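/*
 * The DO_SQADD_* macros are defined earlier in this file and are expected
 * to clamp the widened sum to the signed range of the element.  For
 * example, in the byte helper a scalar of 0x60 (+96) added to an element
 * holding 0x50 (+80) saturates to 0x7f.
 */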
2613 
2614 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2615 {
2616     intptr_t i, oprsz = simd_oprsz(desc);
2617 
2618     for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2619         *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2620     }
2621 }
2622 
2623 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2624 {
2625     intptr_t i, oprsz = simd_oprsz(desc);
2626 
2627     for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2628         *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2629     }
2630 }
2631 
2632 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2633 {
2634     intptr_t i, oprsz = simd_oprsz(desc);
2635 
2636     for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2637         *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2638     }
2639 }
2640 
2641 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2642 {
2643     intptr_t i, oprsz = simd_oprsz(desc);
2644 
2645     for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2646         *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2647     }
2648 }
2649 
2650 /*
2651  * Unsigned saturating addition with scalar operand.
2652  */
2653 
2654 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2655 {
2656     intptr_t i, oprsz = simd_oprsz(desc);
2657 
2658     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2659         *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2660     }
2661 }
2662 
2663 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2664 {
2665     intptr_t i, oprsz = simd_oprsz(desc);
2666 
2667     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2668         *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2669     }
2670 }
2671 
2672 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2673 {
2674     intptr_t i, oprsz = simd_oprsz(desc);
2675 
2676     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2677         *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2678     }
2679 }
2680 
2681 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2682 {
2683     intptr_t i, oprsz = simd_oprsz(desc);
2684 
2685     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2686         *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2687     }
2688 }
2689 
2690 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2691 {
2692     intptr_t i, oprsz = simd_oprsz(desc);
2693 
2694     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2695         *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2696     }
2697 }
2698 
2699 /* Two operand predicated copy immediate with merge.  All valid immediates
2700  * can fit within 17 signed bits in the simd_data field.
2701  */
2702 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2703                          uint64_t mm, uint32_t desc)
2704 {
2705     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2706     uint64_t *d = vd, *n = vn;
2707     uint8_t *pg = vg;
2708 
2709     mm = dup_const(MO_8, mm);
2710     for (i = 0; i < opr_sz; i += 1) {
2711         uint64_t nn = n[i];
2712         uint64_t pp = expand_pred_b(pg[H1(i)]);
2713         d[i] = (mm & pp) | (nn & ~pp);
2714     }
2715 }
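/*
 * expand_pred_b() widens each of the 8 predicate bits of the byte into a
 * byte-sized mask, e.g. 0x05 -> 0x0000000000ff00ff, so the predicated
 * merge above reduces to a plain 64-bit bit-select.
 */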
2716 
2717 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2718                          uint64_t mm, uint32_t desc)
2719 {
2720     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2721     uint64_t *d = vd, *n = vn;
2722     uint8_t *pg = vg;
2723 
2724     mm = dup_const(MO_16, mm);
2725     for (i = 0; i < opr_sz; i += 1) {
2726         uint64_t nn = n[i];
2727         uint64_t pp = expand_pred_h(pg[H1(i)]);
2728         d[i] = (mm & pp) | (nn & ~pp);
2729     }
2730 }
2731 
2732 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2733                          uint64_t mm, uint32_t desc)
2734 {
2735     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2736     uint64_t *d = vd, *n = vn;
2737     uint8_t *pg = vg;
2738 
2739     mm = dup_const(MO_32, mm);
2740     for (i = 0; i < opr_sz; i += 1) {
2741         uint64_t nn = n[i];
2742         uint64_t pp = expand_pred_s(pg[H1(i)]);
2743         d[i] = (mm & pp) | (nn & ~pp);
2744     }
2745 }
2746 
2747 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2748                          uint64_t mm, uint32_t desc)
2749 {
2750     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2751     uint64_t *d = vd, *n = vn;
2752     uint8_t *pg = vg;
2753 
2754     for (i = 0; i < opr_sz; i += 1) {
2755         uint64_t nn = n[i];
2756         d[i] = (pg[H1(i)] & 1 ? mm : nn);
2757     }
2758 }
2759 
2760 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2761 {
2762     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2763     uint64_t *d = vd;
2764     uint8_t *pg = vg;
2765 
2766     val = dup_const(MO_8, val);
2767     for (i = 0; i < opr_sz; i += 1) {
2768         d[i] = val & expand_pred_b(pg[H1(i)]);
2769     }
2770 }
2771 
2772 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2773 {
2774     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2775     uint64_t *d = vd;
2776     uint8_t *pg = vg;
2777 
2778     val = dup_const(MO_16, val);
2779     for (i = 0; i < opr_sz; i += 1) {
2780         d[i] = val & expand_pred_h(pg[H1(i)]);
2781     }
2782 }
2783 
2784 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2785 {
2786     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2787     uint64_t *d = vd;
2788     uint8_t *pg = vg;
2789 
2790     val = dup_const(MO_32, val);
2791     for (i = 0; i < opr_sz; i += 1) {
2792         d[i] = val & expand_pred_s(pg[H1(i)]);
2793     }
2794 }
2795 
2796 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2797 {
2798     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2799     uint64_t *d = vd;
2800     uint8_t *pg = vg;
2801 
2802     for (i = 0; i < opr_sz; i += 1) {
2803         d[i] = (pg[H1(i)] & 1 ? val : 0);
2804     }
2805 }
2806 
2807 /* Big-endian hosts need to frob the byte indices.  If the copy
2808  * happens to be 8-byte aligned, then no frobbing is necessary.
2809  */
2810 static void swap_memmove(void *vd, void *vs, size_t n)
2811 {
2812     uintptr_t d = (uintptr_t)vd;
2813     uintptr_t s = (uintptr_t)vs;
2814     uintptr_t o = (d | s | n) & 7;
2815     size_t i;
2816 
2817 #if !HOST_BIG_ENDIAN
2818     o = 0;
2819 #endif
2820     switch (o) {
2821     case 0:
2822         memmove(vd, vs, n);
2823         break;
2824 
2825     case 4:
2826         if (d < s || d >= s + n) {
2827             for (i = 0; i < n; i += 4) {
2828                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2829             }
2830         } else {
2831             for (i = n; i > 0; ) {
2832                 i -= 4;
2833                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2834             }
2835         }
2836         break;
2837 
2838     case 2:
2839     case 6:
2840         if (d < s || d >= s + n) {
2841             for (i = 0; i < n; i += 2) {
2842                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2843             }
2844         } else {
2845             for (i = n; i > 0; ) {
2846                 i -= 2;
2847                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2848             }
2849         }
2850         break;
2851 
2852     default:
2853         if (d < s || d >= s + n) {
2854             for (i = 0; i < n; i++) {
2855                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2856             }
2857         } else {
2858             for (i = n; i > 0; ) {
2859                 i -= 1;
2860                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2861             }
2862         }
2863         break;
2864     }
2865 }
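/*
 * The H1/H1_2/H1_4 macros adjust byte, halfword and word addresses on
 * big-endian hosts so that sub-64-bit stores land in the lanes a
 * little-endian host would use; whole 64-bit units need no adjustment,
 * which is why the fully 8-byte-aligned case falls through to memmove.
 */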
2866 
2867 /* Similarly for memset of 0.  */
2868 static void swap_memzero(void *vd, size_t n)
2869 {
2870     uintptr_t d = (uintptr_t)vd;
2871     uintptr_t o = (d | n) & 7;
2872     size_t i;
2873 
2874     /* Usually, the first bit of a predicate is set, so N is 0.  */
2875     if (likely(n == 0)) {
2876         return;
2877     }
2878 
2879 #if !HOST_BIG_ENDIAN
2880     o = 0;
2881 #endif
2882     switch (o) {
2883     case 0:
2884         memset(vd, 0, n);
2885         break;
2886 
2887     case 4:
2888         for (i = 0; i < n; i += 4) {
2889             *(uint32_t *)H1_4(d + i) = 0;
2890         }
2891         break;
2892 
2893     case 2:
2894     case 6:
2895         for (i = 0; i < n; i += 2) {
2896             *(uint16_t *)H1_2(d + i) = 0;
2897         }
2898         break;
2899 
2900     default:
2901         for (i = 0; i < n; i++) {
2902             *(uint8_t *)H1(d + i) = 0;
2903         }
2904         break;
2905     }
2906 }
2907 
2908 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2909 {
2910     intptr_t opr_sz = simd_oprsz(desc);
2911     size_t n_ofs = simd_data(desc);
2912     size_t n_siz = opr_sz - n_ofs;
2913 
2914     if (vd != vm) {
2915         swap_memmove(vd, vn + n_ofs, n_siz);
2916         swap_memmove(vd + n_siz, vm, n_ofs);
2917     } else if (vd != vn) {
2918         swap_memmove(vd + n_siz, vd, n_ofs);
2919         swap_memmove(vd, vn + n_ofs, n_siz);
2920     } else {
2921         /* vd == vn == vm.  Need temp space.  */
2922         ARMVectorReg tmp;
2923         swap_memmove(&tmp, vm, n_ofs);
2924         swap_memmove(vd, vd + n_ofs, n_siz);
2925         memcpy(vd + n_siz, &tmp, n_ofs);
2926     }
2927 }
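/*
 * For example, with a 16-byte vector and n_ofs == 3 the result is bytes
 * 3..15 of Zn followed by bytes 0..2 of Zm, matching the SVE EXT
 * instruction's rotate-and-concatenate behaviour.
 */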
2928 
2929 #define DO_INSR(NAME, TYPE, H) \
2930 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2931 {                                                                  \
2932     intptr_t opr_sz = simd_oprsz(desc);                            \
2933     swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
2934     *(TYPE *)(vd + H(0)) = val;                                    \
2935 }
2936 
2937 DO_INSR(sve_insr_b, uint8_t, H1)
2938 DO_INSR(sve_insr_h, uint16_t, H1_2)
2939 DO_INSR(sve_insr_s, uint32_t, H1_4)
2940 DO_INSR(sve_insr_d, uint64_t, H1_8)
2941 
2942 #undef DO_INSR
2943 
2944 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2945 {
2946     intptr_t i, j, opr_sz = simd_oprsz(desc);
2947     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2948         uint64_t f = *(uint64_t *)(vn + i);
2949         uint64_t b = *(uint64_t *)(vn + j);
2950         *(uint64_t *)(vd + i) = bswap64(b);
2951         *(uint64_t *)(vd + j) = bswap64(f);
2952     }
2953 }
2954 
2955 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2956 {
2957     intptr_t i, j, opr_sz = simd_oprsz(desc);
2958     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2959         uint64_t f = *(uint64_t *)(vn + i);
2960         uint64_t b = *(uint64_t *)(vn + j);
2961         *(uint64_t *)(vd + i) = hswap64(b);
2962         *(uint64_t *)(vd + j) = hswap64(f);
2963     }
2964 }
2965 
2966 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2967 {
2968     intptr_t i, j, opr_sz = simd_oprsz(desc);
2969     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2970         uint64_t f = *(uint64_t *)(vn + i);
2971         uint64_t b = *(uint64_t *)(vn + j);
2972         *(uint64_t *)(vd + i) = rol64(b, 32);
2973         *(uint64_t *)(vd + j) = rol64(f, 32);
2974     }
2975 }
2976 
2977 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2978 {
2979     intptr_t i, j, opr_sz = simd_oprsz(desc);
2980     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2981         uint64_t f = *(uint64_t *)(vn + i);
2982         uint64_t b = *(uint64_t *)(vn + j);
2983         *(uint64_t *)(vd + i) = b;
2984         *(uint64_t *)(vd + j) = f;
2985     }
2986 }
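/*
 * Each REV iteration above exchanges the 64-bit chunks at offsets i and j
 * from the two ends of the vector, reversing the elements within each
 * chunk on the way: bswap64 for bytes, hswap64 for halfwords, a 32-bit
 * rotate for words, and nothing for doublewords since the chunk is then
 * a single element.
 */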
2987 
2988 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2989 
2990 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2991                            bool is_tbx, tb_impl_fn *fn)
2992 {
2993     ARMVectorReg scratch;
2994     uintptr_t oprsz = simd_oprsz(desc);
2995 
2996     if (unlikely(vd == vn)) {
2997         vn = memcpy(&scratch, vn, oprsz);
2998     }
2999 
3000     fn(vd, vn, NULL, vm, oprsz, is_tbx);
3001 }
3002 
3003 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
3004                            uint32_t desc, bool is_tbx, tb_impl_fn *fn)
3005 {
3006     ARMVectorReg scratch;
3007     uintptr_t oprsz = simd_oprsz(desc);
3008 
3009     if (unlikely(vd == vn0)) {
3010         vn0 = memcpy(&scratch, vn0, oprsz);
3011         if (vd == vn1) {
3012             vn1 = vn0;
3013         }
3014     } else if (unlikely(vd == vn1)) {
3015         vn1 = memcpy(&scratch, vn1, oprsz);
3016     }
3017 
3018     fn(vd, vn0, vn1, vm, oprsz, is_tbx);
3019 }
3020 
3021 #define DO_TB(SUFF, TYPE, H)                                            \
3022 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
3023                                 void *vm, uintptr_t oprsz, bool is_tbx) \
3024 {                                                                       \
3025     TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
3026     uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
3027     for (i = 0; i < nelem; ++i) {                                       \
3028         TYPE index = indexes[H1(i)], val = 0;                           \
3029         if (index < nelem) {                                            \
3030             val = tbl0[H(index)];                                       \
3031         } else {                                                        \
3032             index -= nelem;                                             \
3033             if (tbl1 && index < nelem) {                                \
3034                 val = tbl1[H(index)];                                   \
3035             } else if (is_tbx) {                                        \
3036                 continue;                                               \
3037             }                                                           \
3038         }                                                               \
3039         d[H(i)] = val;                                                  \
3040     }                                                                   \
3041 }                                                                       \
3042 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3043 {                                                                       \
3044     do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                     \
3045 }                                                                       \
3046 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,            \
3047                              void *vm, uint32_t desc)                   \
3048 {                                                                       \
3049     do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);               \
3050 }                                                                       \
3051 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3052 {                                                                       \
3053     do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                      \
3054 }
3055 
3056 DO_TB(b, uint8_t, H1)
3057 DO_TB(h, uint16_t, H2)
3058 DO_TB(s, uint32_t, H4)
3059 DO_TB(d, uint64_t, H8)
3060 
3061 #undef DO_TB
3062 
3063 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3064 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
3065 {                                                              \
3066     intptr_t i, opr_sz = simd_oprsz(desc);                     \
3067     TYPED *d = vd;                                             \
3068     TYPES *n = vn;                                             \
3069     ARMVectorReg tmp;                                          \
3070     if (unlikely(vn - vd < opr_sz)) {                          \
3071         n = memcpy(&tmp, n, opr_sz / 2);                       \
3072     }                                                          \
3073     for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
3074         d[HD(i)] = n[HS(i)];                                   \
3075     }                                                          \
3076 }
3077 
3078 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3079 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3080 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3081 
3082 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3083 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3084 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3085 
3086 #undef DO_UNPK
3087 
3088 /* Mask of bits included in the even numbered predicates of width esz.
3089  * We also use this for expand_bits/compress_bits, and so extend the
3090  * same pattern out to 16-bit units.
3091  */
3092 static const uint64_t even_bit_esz_masks[5] = {
3093     0x5555555555555555ull,
3094     0x3333333333333333ull,
3095     0x0f0f0f0f0f0f0f0full,
3096     0x00ff00ff00ff00ffull,
3097     0x0000ffff0000ffffull,
3098 };
3099 
3100 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3101  * For N==0, this corresponds to the operation that in qemu/bitops.h
3102  * we call half_shuffle64; this algorithm is from Hacker's Delight,
3103  * section 7-2 Shuffling Bits.
3104  */
3105 static uint64_t expand_bits(uint64_t x, int n)
3106 {
3107     int i;
3108 
3109     x &= 0xffffffffu;
3110     for (i = 4; i >= n; i--) {
3111         int sh = 1 << i;
3112         x = ((x << sh) | x) & even_bit_esz_masks[i];
3113     }
3114     return x;
3115 }
3116 
3117 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3118  * For N==0, this corresponds to the operation that in qemu/bitops.h
3119  * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3120  * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3121  */
3122 static uint64_t compress_bits(uint64_t x, int n)
3123 {
3124     int i;
3125 
3126     for (i = n; i <= 4; i++) {
3127         int sh = 1 << i;
3128         x &= even_bit_esz_masks[i];
3129         x = (x >> sh) | x;
3130     }
3131     return x & 0xffffffffu;
3132 }
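/*
 * For example, expand_bits(0x0f, 0) == 0x55 (each bit moves to twice its
 * old position, with zeros interleaved) and compress_bits(0x55, 0) == 0x0f
 * undoes it.  Larger N perform the same operation on 2**N-bit groups.
 */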
3133 
3134 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3135 {
3136     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3137     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3138     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3139     int esize = 1 << esz;
3140     uint64_t *d = vd;
3141     intptr_t i;
3142 
3143     if (oprsz <= 8) {
3144         uint64_t nn = *(uint64_t *)vn;
3145         uint64_t mm = *(uint64_t *)vm;
3146         int half = 4 * oprsz;
3147 
3148         nn = extract64(nn, high * half, half);
3149         mm = extract64(mm, high * half, half);
3150         nn = expand_bits(nn, esz);
3151         mm = expand_bits(mm, esz);
3152         d[0] = nn | (mm << esize);
3153     } else {
3154         ARMPredicateReg tmp;
3155 
3156         /* We produce output faster than we consume input.
3157            Therefore we must be mindful of possible overlap.  */
3158         if (vd == vn) {
3159             vn = memcpy(&tmp, vn, oprsz);
3160             if (vd == vm) {
3161                 vm = vn;
3162             }
3163         } else if (vd == vm) {
3164             vm = memcpy(&tmp, vm, oprsz);
3165         }
3166         if (high) {
3167             high = oprsz >> 1;
3168         }
3169 
3170         if ((oprsz & 7) == 0) {
3171             uint32_t *n = vn, *m = vm;
3172             high >>= 2;
3173 
3174             for (i = 0; i < oprsz / 8; i++) {
3175                 uint64_t nn = n[H4(high + i)];
3176                 uint64_t mm = m[H4(high + i)];
3177 
3178                 nn = expand_bits(nn, esz);
3179                 mm = expand_bits(mm, esz);
3180                 d[i] = nn | (mm << esize);
3181             }
3182         } else {
3183             uint8_t *n = vn, *m = vm;
3184             uint16_t *d16 = vd;
3185 
3186             for (i = 0; i < oprsz / 2; i++) {
3187                 uint16_t nn = n[H1(high + i)];
3188                 uint16_t mm = m[H1(high + i)];
3189 
3190                 nn = expand_bits(nn, esz);
3191                 mm = expand_bits(mm, esz);
3192                 d16[H2(i)] = nn | (mm << esize);
3193             }
3194         }
3195     }
3196 }
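/*
 * The expand/interleave trick above, for esz == 0: zipping the low halves
 * nn == 0b1111 and mm == 0b0000 gives expand_bits(nn, 0) == 0x55 merged
 * with expand_bits(mm, 0) << 1 == 0, i.e. the byte 0b01010101.
 */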
3197 
3198 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3199 {
3200     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3201     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3202     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3203     uint64_t *d = vd, *n = vn, *m = vm;
3204     uint64_t l, h;
3205     intptr_t i;
3206 
3207     if (oprsz <= 8) {
3208         l = compress_bits(n[0] >> odd, esz);
3209         h = compress_bits(m[0] >> odd, esz);
3210         d[0] = l | (h << (4 * oprsz));
3211     } else {
3212         ARMPredicateReg tmp_m;
3213         intptr_t oprsz_16 = oprsz / 16;
3214 
3215         if ((vm - vd) < (uintptr_t)oprsz) {
3216             m = memcpy(&tmp_m, vm, oprsz);
3217         }
3218 
3219         for (i = 0; i < oprsz_16; i++) {
3220             l = n[2 * i + 0];
3221             h = n[2 * i + 1];
3222             l = compress_bits(l >> odd, esz);
3223             h = compress_bits(h >> odd, esz);
3224             d[i] = l | (h << 32);
3225         }
3226 
3227         /*
3228          * For VL which is not a multiple of 512, the results from M do not
3229          * align nicely with the uint64_t for D.  Put the aligned results
3230          * from M into TMP_M and then copy it into place afterward.
3231          */
3232         if (oprsz & 15) {
3233             int final_shift = (oprsz & 15) * 2;
3234 
3235             l = n[2 * i + 0];
3236             h = n[2 * i + 1];
3237             l = compress_bits(l >> odd, esz);
3238             h = compress_bits(h >> odd, esz);
3239             d[i] = l | (h << final_shift);
3240 
3241             for (i = 0; i < oprsz_16; i++) {
3242                 l = m[2 * i + 0];
3243                 h = m[2 * i + 1];
3244                 l = compress_bits(l >> odd, esz);
3245                 h = compress_bits(h >> odd, esz);
3246                 tmp_m.p[i] = l | (h << 32);
3247             }
3248             l = m[2 * i + 0];
3249             h = m[2 * i + 1];
3250             l = compress_bits(l >> odd, esz);
3251             h = compress_bits(h >> odd, esz);
3252             tmp_m.p[i] = l | (h << final_shift);
3253 
3254             swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3255         } else {
3256             for (i = 0; i < oprsz_16; i++) {
3257                 l = m[2 * i + 0];
3258                 h = m[2 * i + 1];
3259                 l = compress_bits(l >> odd, esz);
3260                 h = compress_bits(h >> odd, esz);
3261                 d[oprsz_16 + i] = l | (h << 32);
3262             }
3263         }
3264     }
3265 }
3266 
3267 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3268 {
3269     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3270     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3271     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3272     uint64_t *d = vd, *n = vn, *m = vm;
3273     uint64_t mask;
3274     int shr, shl;
3275     intptr_t i;
3276 
3277     shl = 1 << esz;
3278     shr = 0;
3279     mask = even_bit_esz_masks[esz];
3280     if (odd) {
3281         mask <<= shl;
3282         shr = shl;
3283         shl = 0;
3284     }
3285 
3286     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3287         uint64_t nn = (n[i] & mask) >> shr;
3288         uint64_t mm = (m[i] & mask) << shl;
3289         d[i] = nn + mm;
3290     }
3291 }
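/*
 * For TRN, mask selects the even-numbered (or odd-numbered, when odd is
 * set) esz-wide bit groups.  N contributes the selected groups in the
 * even slots of the result and M in the odd slots; the two sets of bit
 * positions are disjoint, so the addition is effectively an OR.
 */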
3292 
3293 /* Reverse units of 2**N bits.  */
3294 static uint64_t reverse_bits_64(uint64_t x, int n)
3295 {
3296     int i, sh;
3297 
3298     x = bswap64(x);
3299     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3300         uint64_t mask = even_bit_esz_masks[i];
3301         x = ((x & mask) << sh) | ((x >> sh) & mask);
3302     }
3303     return x;
3304 }
3305 
3306 static uint8_t reverse_bits_8(uint8_t x, int n)
3307 {
3308     static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3309     int i, sh;
3310 
3311     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3312         x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3313     }
3314     return x;
3315 }
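/* E.g. reverse_bits_8(0xc1, 0) == 0x83, a full bit reversal; with n == 2
 * only the two nibbles are swapped.  */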
3316 
3317 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3318 {
3319     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3320     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3321     intptr_t i, oprsz_2 = oprsz / 2;
3322 
3323     if (oprsz <= 8) {
3324         uint64_t l = *(uint64_t *)vn;
3325         l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3326         *(uint64_t *)vd = l;
3327     } else if ((oprsz & 15) == 0) {
3328         for (i = 0; i < oprsz_2; i += 8) {
3329             intptr_t ih = oprsz - 8 - i;
3330             uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3331             uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3332             *(uint64_t *)(vd + i) = h;
3333             *(uint64_t *)(vd + ih) = l;
3334         }
3335     } else {
3336         for (i = 0; i < oprsz_2; i += 1) {
3337             intptr_t il = H1(i);
3338             intptr_t ih = H1(oprsz - 1 - i);
3339             uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3340             uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3341             *(uint8_t *)(vd + il) = h;
3342             *(uint8_t *)(vd + ih) = l;
3343         }
3344     }
3345 }
3346 
3347 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3348 {
3349     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3350     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3351     uint64_t *d = vd;
3352     intptr_t i;
3353 
3354     if (oprsz <= 8) {
3355         uint64_t nn = *(uint64_t *)vn;
3356         int half = 4 * oprsz;
3357 
3358         nn = extract64(nn, high * half, half);
3359         nn = expand_bits(nn, 0);
3360         d[0] = nn;
3361     } else {
3362         ARMPredicateReg tmp_n;
3363 
3364         /* We produce output faster than we consume input.
3365            Therefore we must be mindful of possible overlap.  */
3366         if ((vn - vd) < (uintptr_t)oprsz) {
3367             vn = memcpy(&tmp_n, vn, oprsz);
3368         }
3369         if (high) {
3370             high = oprsz >> 1;
3371         }
3372 
3373         if ((oprsz & 7) == 0) {
3374             uint32_t *n = vn;
3375             high >>= 2;
3376 
3377             for (i = 0; i < oprsz / 8; i++) {
3378                 uint64_t nn = n[H4(high + i)];
3379                 d[i] = expand_bits(nn, 0);
3380             }
3381         } else {
3382             uint16_t *d16 = vd;
3383             uint8_t *n = vn;
3384 
3385             for (i = 0; i < oprsz / 2; i++) {
3386                 uint16_t nn = n[H1(high + i)];
3387                 d16[H2(i)] = expand_bits(nn, 0);
3388             }
3389         }
3390     }
3391 }
3392 
3393 #define DO_ZIP(NAME, TYPE, H) \
3394 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
3395 {                                                                    \
3396     intptr_t oprsz = simd_oprsz(desc);                               \
3397     intptr_t odd_ofs = simd_data(desc);                              \
3398     intptr_t i, oprsz_2 = oprsz / 2;                                 \
3399     ARMVectorReg tmp_n, tmp_m;                                       \
3400     /* We produce output faster than we consume input.               \
3401        Therefore we must be mindful of possible overlap.  */         \
3402     if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
3403         vn = memcpy(&tmp_n, vn, oprsz);                              \
3404     }                                                                \
3405     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
3406         vm = memcpy(&tmp_m, vm, oprsz);                              \
3407     }                                                                \
3408     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
3409         *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3410         *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
3411             *(TYPE *)(vm + odd_ofs + H(i));                          \
3412     }                                                                \
3413     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
3414         memset(vd + oprsz - 16, 0, 16);                              \
3415     }                                                                \
3416 }
3417 
3418 DO_ZIP(sve_zip_b, uint8_t, H1)
3419 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3420 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3421 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3422 DO_ZIP(sve2_zip_q, Int128, )
3423 
3424 #define DO_UZP(NAME, TYPE, H) \
3425 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3426 {                                                                      \
3427     intptr_t oprsz = simd_oprsz(desc);                                 \
3428     intptr_t odd_ofs = simd_data(desc);                                \
3429     intptr_t i, p;                                                     \
3430     ARMVectorReg tmp_m;                                                \
3431     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
3432         vm = memcpy(&tmp_m, vm, oprsz);                                \
3433     }                                                                  \
3434     i = 0, p = odd_ofs;                                                \
3435     do {                                                               \
3436         *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
3437         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3438     } while (p < oprsz);                                               \
3439     p -= oprsz;                                                        \
3440     do {                                                               \
3441         *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
3442         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3443     } while (p < oprsz);                                               \
3444     tcg_debug_assert(i == oprsz);                                      \
3445 }
3446 
3447 DO_UZP(sve_uzp_b, uint8_t, H1)
3448 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3449 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3450 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3451 DO_UZP(sve2_uzp_q, Int128, )
3452 
3453 #define DO_TRN(NAME, TYPE, H) \
3454 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3455 {                                                                      \
3456     intptr_t oprsz = simd_oprsz(desc);                                 \
3457     intptr_t odd_ofs = simd_data(desc);                                \
3458     intptr_t i;                                                        \
3459     for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
3460         TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
3461         TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
3462         *(TYPE *)(vd + H(i + 0)) = ae;                                 \
3463         *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
3464     }                                                                  \
3465     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
3466         memset(vd + oprsz - 16, 0, 16);                                \
3467     }                                                                  \
3468 }
3469 
3470 DO_TRN(sve_trn_b, uint8_t, H1)
3471 DO_TRN(sve_trn_h, uint16_t, H1_2)
3472 DO_TRN(sve_trn_s, uint32_t, H1_4)
3473 DO_TRN(sve_trn_d, uint64_t, H1_8)
3474 DO_TRN(sve2_trn_q, Int128, )
3475 
3476 #undef DO_ZIP
3477 #undef DO_UZP
3478 #undef DO_TRN
3479 
3480 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3481 {
3482     intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3483     uint32_t *d = vd, *n = vn;
3484     uint8_t *pg = vg;
3485 
3486     for (i = j = 0; i < opr_sz; i++) {
3487         if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3488             d[H4(j)] = n[H4(i)];
3489             j++;
3490         }
3491     }
3492     for (; j < opr_sz; j++) {
3493         d[H4(j)] = 0;
3494     }
3495 }
3496 
3497 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3498 {
3499     intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3500     uint64_t *d = vd, *n = vn;
3501     uint8_t *pg = vg;
3502 
3503     for (i = j = 0; i < opr_sz; i++) {
3504         if (pg[H1(i)] & 1) {
3505             d[j] = n[i];
3506             j++;
3507         }
3508     }
3509     for (; j < opr_sz; j++) {
3510         d[j] = 0;
3511     }
3512 }
3513 
3514 /* Similar to the ARM LastActiveElement pseudocode function, except the
3515  * result is multiplied by the element size.  This includes the not found
3516  * indication; e.g. not found for esz=3 is -8.
3517  */
3518 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3519 {
3520     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3521     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3522 
3523     return last_active_element(vg, words, esz);
3524 }
3525 
3526 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3527 {
3528     intptr_t opr_sz = simd_oprsz(desc) / 8;
3529     int esz = simd_data(desc);
3530     uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3531     intptr_t i, first_i, last_i;
3532     ARMVectorReg tmp;
3533 
3534     first_i = last_i = 0;
3535     first_g = last_g = 0;
3536 
3537     /* Find the extent of the active elements within VG.  */
3538     for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3539         pg = *(uint64_t *)(vg + i) & mask;
3540         if (pg) {
3541             if (last_g == 0) {
3542                 last_g = pg;
3543                 last_i = i;
3544             }
3545             first_g = pg;
3546             first_i = i;
3547         }
3548     }
3549 
3550     len = 0;
3551     if (first_g != 0) {
3552         first_i = first_i * 8 + ctz64(first_g);
3553         last_i = last_i * 8 + 63 - clz64(last_g);
3554         len = last_i - first_i + (1 << esz);
3555         if (vd == vm) {
3556             vm = memcpy(&tmp, vm, opr_sz * 8);
3557         }
3558         swap_memmove(vd, vn + first_i, len);
3559     }
3560     swap_memmove(vd + len, vm, opr_sz * 8 - len);
3561 }
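/*
 * In SPLICE, FIRST_I and LAST_I end up as predicate bit indices, which
 * (one bit per vector byte) are also byte offsets into the vector, so LEN
 * is the byte length of the active segment.  That segment of Zn is copied
 * to the bottom of the destination and the remainder is filled from the
 * start of Zm.
 */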
3562 
3563 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3564                             void *vg, uint32_t desc)
3565 {
3566     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3567     uint64_t *d = vd, *n = vn, *m = vm;
3568     uint8_t *pg = vg;
3569 
3570     for (i = 0; i < opr_sz; i += 1) {
3571         uint64_t nn = n[i], mm = m[i];
3572         uint64_t pp = expand_pred_b(pg[H1(i)]);
3573         d[i] = (nn & pp) | (mm & ~pp);
3574     }
3575 }
3576 
3577 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3578                             void *vg, uint32_t desc)
3579 {
3580     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3581     uint64_t *d = vd, *n = vn, *m = vm;
3582     uint8_t *pg = vg;
3583 
3584     for (i = 0; i < opr_sz; i += 1) {
3585         uint64_t nn = n[i], mm = m[i];
3586         uint64_t pp = expand_pred_h(pg[H1(i)]);
3587         d[i] = (nn & pp) | (mm & ~pp);
3588     }
3589 }
3590 
3591 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3592                             void *vg, uint32_t desc)
3593 {
3594     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3595     uint64_t *d = vd, *n = vn, *m = vm;
3596     uint8_t *pg = vg;
3597 
3598     for (i = 0; i < opr_sz; i += 1) {
3599         uint64_t nn = n[i], mm = m[i];
3600         uint64_t pp = expand_pred_s(pg[H1(i)]);
3601         d[i] = (nn & pp) | (mm & ~pp);
3602     }
3603 }
3604 
3605 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3606                             void *vg, uint32_t desc)
3607 {
3608     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3609     uint64_t *d = vd, *n = vn, *m = vm;
3610     uint8_t *pg = vg;
3611 
3612     for (i = 0; i < opr_sz; i += 1) {
3613         uint64_t nn = n[i], mm = m[i];
3614         d[i] = (pg[H1(i)] & 1 ? nn : mm);
3615     }
3616 }
3617 
3618 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3619                             void *vg, uint32_t desc)
3620 {
3621     intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3622     Int128 *d = vd, *n = vn, *m = vm;
3623     uint16_t *pg = vg;
3624 
3625     for (i = 0; i < opr_sz; i += 1) {
3626         d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3627     }
3628 }
3629 
3630 /* Two operand comparison controlled by a predicate.
3631  * ??? It is very tempting to want to be able to expand this inline
3632  * with x86 instructions, e.g.
3633  *
3634  *    vcmpeqw    zm, zn, %ymm0
3635  *    vpmovmskb  %ymm0, %eax
3636  *    and        $0x5555, %eax
3637  *    and        pg, %eax
3638  *
3639  * or even aarch64, e.g.
3640  *
3641  *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3642  *    cmeq       v0.8h, zn, zm
3643  *    and        v0.8h, v0.8h, mask
3644  *    addv       h0, v0.8h
3645  *    and        v0.8b, pg
3646  *
3647  * However, coming up with an abstraction that allows vector inputs and
3648  * a scalar output, and also handles the byte-ordering of sub-uint64_t
3649  * scalar outputs, is tricky.
3650  */
3651 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
3652 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3653 {                                                                            \
3654     intptr_t opr_sz = simd_oprsz(desc);                                      \
3655     uint32_t flags = PREDTEST_INIT;                                          \
3656     intptr_t i = opr_sz;                                                     \
3657     do {                                                                     \
3658         uint64_t out = 0, pg;                                                \
3659         do {                                                                 \
3660             i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
3661             TYPE nn = *(TYPE *)(vn + H(i));                                  \
3662             TYPE mm = *(TYPE *)(vm + H(i));                                  \
3663             out |= nn OP mm;                                                 \
3664         } while (i & 63);                                                    \
3665         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3666         out &= pg;                                                           \
3667         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3668         flags = iter_predtest_bwd(out, pg, flags);                           \
3669     } while (i > 0);                                                         \
3670     return flags;                                                            \
3671 }
3672 
3673 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3674     DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3675 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3676     DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3677 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3678     DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3679 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3680     DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
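/*
 * Each element deposits its result bit at the element's byte offset, so
 * consecutive results are sizeof(TYPE) bits apart; the MASK arguments
 * above keep exactly those positions of the predicate word (every bit for
 * bytes, every 2nd bit for halfwords, every 4th for words, every 8th for
 * doublewords).
 */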
3681 
3682 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
3683 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3684 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3685 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3686 
3687 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
3688 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3689 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3690 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3691 
3692 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
3693 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3694 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3695 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3696 
3697 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
3698 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3699 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3700 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3701 
3702 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
3703 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3704 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3705 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3706 
3707 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
3708 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3709 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3710 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3711 
3712 #undef DO_CMP_PPZZ_B
3713 #undef DO_CMP_PPZZ_H
3714 #undef DO_CMP_PPZZ_S
3715 #undef DO_CMP_PPZZ_D
3716 #undef DO_CMP_PPZZ
3717 
3718 /* Similar, but the second source is "wide".  */
3719 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
3720 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3721 {                                                                            \
3722     intptr_t opr_sz = simd_oprsz(desc);                                      \
3723     uint32_t flags = PREDTEST_INIT;                                          \
3724     intptr_t i = opr_sz;                                                     \
3725     do {                                                                     \
3726         uint64_t out = 0, pg;                                                \
3727         do {                                                                 \
3728             TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
3729             do {                                                             \
3730                 i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
3731                 TYPE nn = *(TYPE *)(vn + H(i));                              \
3732                 out |= nn OP mm;                                             \
3733             } while (i & 7);                                                 \
3734         } while (i & 63);                                                    \
3735         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3736         out &= pg;                                                           \
3737         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3738         flags = iter_predtest_bwd(out, pg, flags);                           \
3739     } while (i > 0);                                                         \
3740     return flags;                                                            \
3741 }
3742 
3743 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3744     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
3745 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3746     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3747 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3748     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3749 
3750 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
3751 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3752 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3753 
3754 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
3755 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3756 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3757 
3758 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
3759 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
3760 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)
3761 
3762 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
3763 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
3764 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)
3765 
3766 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
3767 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3768 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3769 
3770 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
3771 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3772 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3773 
3774 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
3775 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
3776 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)
3777 
3778 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
3779 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
3780 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)
3781 
3782 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
3783 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3784 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3785 
3786 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
3787 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3788 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3789 
3790 #undef DO_CMP_PPZW_B
3791 #undef DO_CMP_PPZW_H
3792 #undef DO_CMP_PPZW_S
3793 #undef DO_CMP_PPZW
3794 
3795 /* Similar, but the second source is immediate.  */
3796 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
3797 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
3798 {                                                                    \
3799     intptr_t opr_sz = simd_oprsz(desc);                              \
3800     uint32_t flags = PREDTEST_INIT;                                  \
3801     TYPE mm = simd_data(desc);                                       \
3802     intptr_t i = opr_sz;                                             \
3803     do {                                                             \
3804         uint64_t out = 0, pg;                                        \
3805         do {                                                         \
3806             i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
3807             TYPE nn = *(TYPE *)(vn + H(i));                          \
3808             out |= nn OP mm;                                         \
3809         } while (i & 63);                                            \
3810         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
3811         out &= pg;                                                   \
3812         *(uint64_t *)(vd + (i >> 3)) = out;                          \
3813         flags = iter_predtest_bwd(out, pg, flags);                   \
3814     } while (i > 0);                                                 \
3815     return flags;                                                    \
3816 }
3817 
3818 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3819     DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3820 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3821     DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3822 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3823     DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3824 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3825     DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3826 
3827 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
3828 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3829 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3830 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3831 
3832 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
3833 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3834 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3835 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3836 
3837 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
3838 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3839 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3840 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3841 
3842 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
3843 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3844 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3845 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3846 
3847 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
3848 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3849 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3850 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3851 
3852 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
3853 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3854 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3855 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3856 
3857 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
3858 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3859 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3860 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3861 
3862 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
3863 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3864 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3865 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3866 
3867 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
3868 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3869 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3870 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3871 
3872 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
3873 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3874 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3875 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3876 
3877 #undef DO_CMP_PPZI_B
3878 #undef DO_CMP_PPZI_H
3879 #undef DO_CMP_PPZI_S
3880 #undef DO_CMP_PPZI_D
3881 #undef DO_CMP_PPZI
3882 
3883 /* Similar to the ARM LastActive pseudocode function.  */
3884 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3885 {
3886     intptr_t i;
3887 
3888     for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3889         uint64_t pg = *(uint64_t *)(vg + i);
3890         if (pg) {
3891             return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3892         }
3893     }
3894     return 0;
3895 }
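/*
 * pow2floor(pg) isolates the highest guard bit in the word, i.e. the last
 * active element; the AND then tests whether that element is set in VD.
 */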
3896 
3897 /* Compute a mask into RETB that is true for all G, up to and including
3898  * (if after) or excluding (if !after) the first G & N.
3899  * Return true if BRK found.
3900  */
3901 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3902                         bool brk, bool after)
3903 {
3904     uint64_t b;
3905 
3906     if (brk) {
3907         b = 0;
3908     } else if ((g & n) == 0) {
3909         /* For all G, no N are set; break not found.  */
3910         b = g;
3911     } else {
3912         /* Break somewhere in N.  Locate it.  */
3913         b = g & n;            /* guard true, pred true */
3914         b = b & -b;           /* first such */
3915         if (after) {
3916             b = b | (b - 1);  /* break after same */
3917         } else {
3918             b = b - 1;        /* break before same */
3919         }
3920         brk = true;
3921     }
3922 
3923     *retb = b;
3924     return brk;
3925 }
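/*
 * For example, with g & n == 0b00100100 the first true-and-active bit is
 * 0b100; "break after" produces the mask 0b111 (element included) while
 * "break before" produces 0b011 (element excluded).  Bits above the break
 * stay clear, and once BRK is true all subsequent words become zero.
 */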
3926 
3927 /* Compute a zeroing BRK.  */
3928 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3929                           intptr_t oprsz, bool after)
3930 {
3931     bool brk = false;
3932     intptr_t i;
3933 
3934     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3935         uint64_t this_b, this_g = g[i];
3936 
3937         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3938         d[i] = this_b & this_g;
3939     }
3940 }
3941 
3942 /* Likewise, but also compute flags.  */
3943 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3944                                intptr_t oprsz, bool after)
3945 {
3946     uint32_t flags = PREDTEST_INIT;
3947     bool brk = false;
3948     intptr_t i;
3949 
3950     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3951         uint64_t this_b, this_d, this_g = g[i];
3952 
3953         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3954         d[i] = this_d = this_b & this_g;
3955         flags = iter_predtest_fwd(this_d, this_g, flags);
3956     }
3957     return flags;
3958 }
3959 
3960 /* Compute a merging BRK.  */
3961 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3962                           intptr_t oprsz, bool after)
3963 {
3964     bool brk = false;
3965     intptr_t i;
3966 
3967     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3968         uint64_t this_b, this_g = g[i];
3969 
3970         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3971         d[i] = (this_b & this_g) | (d[i] & ~this_g);
3972     }
3973 }
3974 
3975 /* Likewise, but also compute flags.  */
3976 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3977                                intptr_t oprsz, bool after)
3978 {
3979     uint32_t flags = PREDTEST_INIT;
3980     bool brk = false;
3981     intptr_t i;
3982 
3983     for (i = 0; i < oprsz / 8; ++i) {
3984         uint64_t this_b, this_d = d[i], this_g = g[i];
3985 
3986         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3987         d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3988         flags = iter_predtest_fwd(this_d, this_g, flags);
3989     }
3990     return flags;
3991 }
3992 
3993 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3994 {
3995     /* It is quicker to zero the whole predicate than loop on OPRSZ.
3996      * The compiler should turn this into 4 64-bit integer stores.
3997      */
3998     memset(d, 0, sizeof(ARMPredicateReg));
3999     return PREDTEST_INIT;
4000 }
4001 
4002 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
4003                        uint32_t pred_desc)
4004 {
4005     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4006     if (last_active_pred(vn, vg, oprsz)) {
4007         compute_brk_z(vd, vm, vg, oprsz, true);
4008     } else {
4009         do_zero(vd, oprsz);
4010     }
4011 }
4012 
4013 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
4014                             uint32_t pred_desc)
4015 {
4016     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4017     if (last_active_pred(vn, vg, oprsz)) {
4018         return compute_brks_z(vd, vm, vg, oprsz, true);
4019     } else {
4020         return do_zero(vd, oprsz);
4021     }
4022 }
4023 
4024 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
4025                        uint32_t pred_desc)
4026 {
4027     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4028     if (last_active_pred(vn, vg, oprsz)) {
4029         compute_brk_z(vd, vm, vg, oprsz, false);
4030     } else {
4031         do_zero(vd, oprsz);
4032     }
4033 }
4034 
4035 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4036                             uint32_t pred_desc)
4037 {
4038     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4039     if (last_active_pred(vn, vg, oprsz)) {
4040         return compute_brks_z(vd, vm, vg, oprsz, false);
4041     } else {
4042         return do_zero(vd, oprsz);
4043     }
4044 }
4045 
4046 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4047 {
4048     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4049     compute_brk_z(vd, vn, vg, oprsz, true);
4050 }
4051 
4052 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4053 {
4054     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4055     return compute_brks_z(vd, vn, vg, oprsz, true);
4056 }
4057 
4058 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4059 {
4060     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4061     compute_brk_z(vd, vn, vg, oprsz, false);
4062 }
4063 
4064 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4065 {
4066     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4067     return compute_brks_z(vd, vn, vg, oprsz, false);
4068 }
4069 
4070 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4071 {
4072     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4073     compute_brk_m(vd, vn, vg, oprsz, true);
4074 }
4075 
4076 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4077 {
4078     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4079     return compute_brks_m(vd, vn, vg, oprsz, true);
4080 }
4081 
4082 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4083 {
4084     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4085     compute_brk_m(vd, vn, vg, oprsz, false);
4086 }
4087 
4088 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4089 {
4090     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4091     return compute_brks_m(vd, vn, vg, oprsz, false);
4092 }
4093 
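/* BRKN is destructive on Pd == Pm: when the last active element of Pn
 * is true the destination is left unchanged, otherwise it is zeroed.
 */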
4094 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4095 {
4096     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4097     if (!last_active_pred(vn, vg, oprsz)) {
4098         do_zero(vd, oprsz);
4099     }
4100 }
4101 
4102 /* As if PredTest(Ones(PL), D, esz).  */
4103 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4104                               uint64_t esz_mask)
4105 {
4106     uint32_t flags = PREDTEST_INIT;
4107     intptr_t i;
4108 
4109     for (i = 0; i < oprsz / 8; i++) {
4110         flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4111     }
4112     if (oprsz & 7) {
4113         uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4114         flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4115     }
4116     return flags;
4117 }
4118 
4119 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4120 {
4121     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4122     if (last_active_pred(vn, vg, oprsz)) {
4123         return predtest_ones(vd, oprsz, -1);
4124     } else {
4125         return do_zero(vd, oprsz);
4126     }
4127 }
4128 
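/* CNTP: count the elements active in both Pn and Pg, for the given
 * element size.
 */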
4129 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4130 {
4131     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4132     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4133     uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4134     intptr_t i;
4135 
4136     for (i = 0; i < words; ++i) {
4137         uint64_t t = n[i] & g[i] & mask;
4138         sum += ctpop64(t);
4139     }
4140     return sum;
4141 }
4142 
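/* Construct a predicate with the low COUNT bits set, filtered to one
 * bit per element of size 1 << ESZ, and return the PredTest flags.
 */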
4143 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4144 {
4145     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4146     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4147     uint64_t esz_mask = pred_esz_masks[esz];
4148     ARMPredicateReg *d = vd;
4149     uint32_t flags;
4150     intptr_t i;
4151 
4152     /* Begin with a zero predicate register.  */
4153     flags = do_zero(d, oprsz);
4154     if (count == 0) {
4155         return flags;
4156     }
4157 
4158     /* Set all of the requested bits.  */
4159     for (i = 0; i < count / 64; ++i) {
4160         d->p[i] = esz_mask;
4161     }
4162     if (count & 63) {
4163         d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4164     }
4165 
4166     return predtest_ones(d, oprsz, esz_mask);
4167 }
4168 
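/* Likewise, but set the high COUNT bits, counting down from the top
 * of the vector.
 */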
4169 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4170 {
4171     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4172     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4173     uint64_t esz_mask = pred_esz_masks[esz];
4174     ARMPredicateReg *d = vd;
4175     intptr_t i, invcount, oprbits;
4176     uint64_t bits;
4177 
4178     if (count == 0) {
4179         return do_zero(d, oprsz);
4180     }
4181 
4182     oprbits = oprsz * 8;
4183     tcg_debug_assert(count <= oprbits);
4184 
4185     bits = esz_mask;
4186     if (oprbits & 63) {
4187         bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4188     }
4189 
4190     invcount = oprbits - count;
4191     for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4192         d->p[i] = bits;
4193         bits = esz_mask;
4194     }
4195 
4196     d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4197 
4198     while (--i >= 0) {
4199         d->p[i] = 0;
4200     }
4201 
4202     return predtest_ones(d, oprsz, esz_mask);
4203 }
4204 
4205 /* Recursive reduction on a function;
4206  * Cf. the ARM ARM function ReducePredicated.
4207  *
4208  * While it would be possible to write this without the DATA temporary,
4209  * it is much simpler to process the predicate register this way.
4210  * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4211  * little to gain with a more complex non-recursive form.
4212  */
4213 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
4214 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4215 {                                                                     \
4216     if (n == 1) {                                                     \
4217         return *data;                                                 \
4218     } else {                                                          \
4219         uintptr_t half = n / 2;                                       \
4220         TYPE lo = NAME##_reduce(data, status, half);                  \
4221         TYPE hi = NAME##_reduce(data + half, status, half);           \
4222         return FUNC(lo, hi, status);                                  \
4223     }                                                                 \
4224 }                                                                     \
4225 uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \
4226 {                                                                     \
4227     uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
4228     TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
4229     for (i = 0; i < oprsz; ) {                                        \
4230         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
4231         do {                                                          \
4232             TYPE nn = *(TYPE *)(vn + H(i));                           \
4233             *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
4234             i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
4235         } while (i & 15);                                             \
4236     }                                                                 \
4237     for (; i < maxsz; i += sizeof(TYPE)) {                            \
4238         *(TYPE *)((void *)data + i) = IDENT;                          \
4239     }                                                                 \
4240     return NAME##_reduce(data, s, maxsz / sizeof(TYPE));              \
4241 }
4242 
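/* In each instantiation below, IDENT is the identity value for OP.
 * The recursion halves the element count at each step, so MAXSZ
 * (passed via simd_data) is expected to be a power of two; inactive
 * elements, and the padding from OPRSZ up to MAXSZ, are filled with
 * IDENT so that they do not affect the result.
 */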
4243 DO_REDUCE(sve_faddv_h, float16, H1_2, float16_add, float16_zero)
4244 DO_REDUCE(sve_faddv_s, float32, H1_4, float32_add, float32_zero)
4245 DO_REDUCE(sve_faddv_d, float64, H1_8, float64_add, float64_zero)
4246 
4247 /* Identity is floatN_default_nan, without the function call.  */
4248 DO_REDUCE(sve_fminnmv_h, float16, H1_2, float16_minnum, 0x7E00)
4249 DO_REDUCE(sve_fminnmv_s, float32, H1_4, float32_minnum, 0x7FC00000)
4250 DO_REDUCE(sve_fminnmv_d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL)
4251 
4252 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, float16_maxnum, 0x7E00)
4253 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, float32_maxnum, 0x7FC00000)
4254 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL)
4255 
4256 DO_REDUCE(sve_fminv_h, float16, H1_2, float16_min, float16_infinity)
4257 DO_REDUCE(sve_fminv_s, float32, H1_4, float32_min, float32_infinity)
4258 DO_REDUCE(sve_fminv_d, float64, H1_8, float64_min, float64_infinity)
4259 
4260 DO_REDUCE(sve_fmaxv_h, float16, H1_2, float16_max, float16_chs(float16_infinity))
4261 DO_REDUCE(sve_fmaxv_s, float32, H1_4, float32_max, float32_chs(float32_infinity))
4262 DO_REDUCE(sve_fmaxv_d, float64, H1_8, float64_max, float64_chs(float64_infinity))
4263 
4264 DO_REDUCE(sve_ah_fminv_h, float16, H1_2, helper_vfp_ah_minh, float16_infinity)
4265 DO_REDUCE(sve_ah_fminv_s, float32, H1_4, helper_vfp_ah_mins, float32_infinity)
4266 DO_REDUCE(sve_ah_fminv_d, float64, H1_8, helper_vfp_ah_mind, float64_infinity)
4267 
4268 DO_REDUCE(sve_ah_fmaxv_h, float16, H1_2, helper_vfp_ah_maxh,
4269           float16_chs(float16_infinity))
4270 DO_REDUCE(sve_ah_fmaxv_s, float32, H1_4, helper_vfp_ah_maxs,
4271           float32_chs(float32_infinity))
4272 DO_REDUCE(sve_ah_fmaxv_d, float64, H1_8, helper_vfp_ah_maxd,
4273           float64_chs(float64_infinity))
4274 
4275 #undef DO_REDUCE
4276 
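/* Strictly-ordered floating-point accumulation (FADDA): add each
 * active element of Zm to the scalar NN, in element order.
 */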
4277 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4278                              float_status *status, uint32_t desc)
4279 {
4280     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4281     float16 result = nn;
4282 
4283     do {
4284         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4285         do {
4286             if (pg & 1) {
4287                 float16 mm = *(float16 *)(vm + H1_2(i));
4288                 result = float16_add(result, mm, status);
4289             }
4290             i += sizeof(float16), pg >>= sizeof(float16);
4291         } while (i & 15);
4292     } while (i < opr_sz);
4293 
4294     return result;
4295 }
4296 
4297 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4298                              float_status *status, uint32_t desc)
4299 {
4300     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4301     float32 result = nn;
4302 
4303     do {
4304         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4305         do {
4306             if (pg & 1) {
4307                 float32 mm = *(float32 *)(vm + H1_2(i));
4308                 result = float32_add(result, mm, status);
4309             }
4310             i += sizeof(float32), pg >>= sizeof(float32);
4311         } while (i & 15);
4312     } while (i < opr_sz);
4313 
4314     return result;
4315 }
4316 
4317 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4318                              float_status *status, uint32_t desc)
4319 {
4320     intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4321     uint64_t *m = vm;
4322     uint8_t *pg = vg;
4323 
4324     for (i = 0; i < opr_sz; i++) {
4325         if (pg[H1(i)] & 1) {
4326             nn = float64_add(nn, m[i], status);
4327         }
4328     }
4329 
4330     return nn;
4331 }
4332 
4333 /* Fully general three-operand expander, controlled by a predicate,
4334  * with the extra float_status parameter.
4335  */
4336 #define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
4337 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
4338                   float_status *status, uint32_t desc)          \
4339 {                                                               \
4340     intptr_t i = simd_oprsz(desc);                              \
4341     uint64_t *g = vg;                                           \
4342     do {                                                        \
4343         uint64_t pg = g[(i - 1) >> 6];                          \
4344         do {                                                    \
4345             i -= sizeof(TYPE);                                  \
4346             if (likely((pg >> (i & 63)) & 1)) {                 \
4347                 TYPE nn = *(TYPE *)(vn + H(i));                 \
4348                 TYPE mm = *(TYPE *)(vm + H(i));                 \
4349                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
4350             }                                                   \
4351         } while (i & 63);                                       \
4352     } while (i != 0);                                           \
4353 }
4354 
4355 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4356 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4357 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4358 
4359 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4360 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4361 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4362 
4363 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4364 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4365 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4366 
4367 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4368 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4369 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4370 
4371 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4372 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4373 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4374 
4375 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4376 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4377 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4378 
4379 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh)
4380 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins)
4381 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind)
4382 
4383 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh)
4384 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs)
4385 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd)
4386 
4387 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4388 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4389 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4390 
4391 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4392 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4393 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4394 
4395 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4396 {
4397     return float16_abs(float16_sub(a, b, s));
4398 }
4399 
4400 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4401 {
4402     return float32_abs(float32_sub(a, b, s));
4403 }
4404 
4405 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4406 {
4407     return float64_abs(float64_sub(a, b, s));
4408 }
4409 
4410 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
4411 static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat)
4412 {
4413     float16 r = float16_sub(op1, op2, stat);
4414     return float16_is_any_nan(r) ? r : float16_abs(r);
4415 }
4416 
4417 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat)
4418 {
4419     float32 r = float32_sub(op1, op2, stat);
4420     return float32_is_any_nan(r) ? r : float32_abs(r);
4421 }
4422 
4423 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat)
4424 {
4425     float64 r = float64_sub(op1, op2, stat);
4426     return float64_is_any_nan(r) ? r : float64_abs(r);
4427 }
4428 
4429 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4430 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4431 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4432 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h)
4433 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s)
4434 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d)
4435 
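/* Clamp the 64-bit scale amount to the host int range accepted by
 * float64_scalbn.
 */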
4436 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4437 {
4438     int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4439     return float64_scalbn(a, b_int, s);
4440 }
4441 
4442 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4443 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4444 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4445 
4446 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4447 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4448 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4449 
4450 #undef DO_ZPZZ_FP
4451 
4452 /* Three-operand expander, with one scalar operand, controlled by
4453  * a predicate, with the extra float_status parameter.
4454  */
4455 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4456 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
4457                   float_status *status, uint32_t desc)            \
4458 {                                                                 \
4459     intptr_t i = simd_oprsz(desc);                                \
4460     uint64_t *g = vg;                                             \
4461     TYPE mm = scalar;                                             \
4462     do {                                                          \
4463         uint64_t pg = g[(i - 1) >> 6];                            \
4464         do {                                                      \
4465             i -= sizeof(TYPE);                                    \
4466             if (likely((pg >> (i & 63)) & 1)) {                   \
4467                 TYPE nn = *(TYPE *)(vn + H(i));                   \
4468                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
4469             }                                                     \
4470         } while (i & 63);                                         \
4471     } while (i != 0);                                             \
4472 }
4473 
4474 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4475 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4476 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4477 
4478 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4479 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4480 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4481 
4482 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4483 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4484 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4485 
4486 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4487 {
4488     return float16_sub(b, a, s);
4489 }
4490 
4491 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4492 {
4493     return float32_sub(b, a, s);
4494 }
4495 
4496 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4497 {
4498     return float64_sub(b, a, s);
4499 }
4500 
4501 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4502 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4503 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4504 
4505 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4506 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4507 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4508 
4509 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4510 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4511 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4512 
4513 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4514 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4515 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4516 
4517 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4518 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4519 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4520 
4521 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh)
4522 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs)
4523 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd)
4524 
4525 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh)
4526 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins)
4527 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind)
4528 
4529 /* Fully general two-operand expander, controlled by a predicate,
4530  * with the extra float_status parameter.
4531  */
4532 #define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
4533 void HELPER(NAME)(void *vd, void *vn, void *vg,                       \
4534                   float_status *status, uint32_t desc)                \
4535 {                                                                     \
4536     intptr_t i = simd_oprsz(desc);                                    \
4537     uint64_t *g = vg;                                                 \
4538     do {                                                              \
4539         uint64_t pg = g[(i - 1) >> 6];                                \
4540         do {                                                          \
4541             i -= sizeof(TYPE);                                        \
4542             if (likely((pg >> (i & 63)) & 1)) {                       \
4543                 TYPE nn = *(TYPE *)(vn + H(i));                       \
4544                 *(TYPE *)(vd + H(i)) = OP(nn, status);                \
4545             }                                                         \
4546         } while (i & 63);                                             \
4547     } while (i != 0);                                                 \
4548 }
4549 
4550 /* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
4551  * FZ16.  When converting from fp16, this affects flushing input denormals;
4552  * when converting to fp16, this affects flushing output denormals.
4553  */
4554 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4555 {
4556     bool save = get_flush_inputs_to_zero(fpst);
4557     float32 ret;
4558 
4559     set_flush_inputs_to_zero(false, fpst);
4560     ret = float16_to_float32(f, true, fpst);
4561     set_flush_inputs_to_zero(save, fpst);
4562     return ret;
4563 }
4564 
4565 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4566 {
4567     bool save = get_flush_inputs_to_zero(fpst);
4568     float64 ret;
4569 
4570     set_flush_inputs_to_zero(false, fpst);
4571     ret = float16_to_float64(f, true, fpst);
4572     set_flush_inputs_to_zero(save, fpst);
4573     return ret;
4574 }
4575 
4576 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4577 {
4578     bool save = get_flush_to_zero(fpst);
4579     float16 ret;
4580 
4581     set_flush_to_zero(false, fpst);
4582     ret = float32_to_float16(f, true, fpst);
4583     set_flush_to_zero(save, fpst);
4584     return ret;
4585 }
4586 
4587 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4588 {
4589     bool save = get_flush_to_zero(fpst);
4590     float16 ret;
4591 
4592     set_flush_to_zero(false, fpst);
4593     ret = float64_to_float16(f, true, fpst);
4594     set_flush_to_zero(save, fpst);
4595     return ret;
4596 }
4597 
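/* Round-to-zero conversions which return 0 for a NaN input, raising
 * Invalid Operation, as the Arm conversion semantics require.
 */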
4598 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4599 {
4600     if (float16_is_any_nan(f)) {
4601         float_raise(float_flag_invalid, s);
4602         return 0;
4603     }
4604     return float16_to_int16_round_to_zero(f, s);
4605 }
4606 
4607 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4608 {
4609     if (float16_is_any_nan(f)) {
4610         float_raise(float_flag_invalid, s);
4611         return 0;
4612     }
4613     return float16_to_int64_round_to_zero(f, s);
4614 }
4615 
4616 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4617 {
4618     if (float32_is_any_nan(f)) {
4619         float_raise(float_flag_invalid, s);
4620         return 0;
4621     }
4622     return float32_to_int64_round_to_zero(f, s);
4623 }
4624 
4625 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4626 {
4627     if (float64_is_any_nan(f)) {
4628         float_raise(float_flag_invalid, s);
4629         return 0;
4630     }
4631     return float64_to_int64_round_to_zero(f, s);
4632 }
4633 
4634 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4635 {
4636     if (float16_is_any_nan(f)) {
4637         float_raise(float_flag_invalid, s);
4638         return 0;
4639     }
4640     return float16_to_uint16_round_to_zero(f, s);
4641 }
4642 
4643 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4644 {
4645     if (float16_is_any_nan(f)) {
4646         float_raise(float_flag_invalid, s);
4647         return 0;
4648     }
4649     return float16_to_uint64_round_to_zero(f, s);
4650 }
4651 
4652 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4653 {
4654     if (float32_is_any_nan(f)) {
4655         float_raise(float_flag_invalid, s);
4656         return 0;
4657     }
4658     return float32_to_uint64_round_to_zero(f, s);
4659 }
4660 
4661 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4662 {
4663     if (float64_is_any_nan(f)) {
4664         float_raise(float_flag_invalid, s);
4665         return 0;
4666     }
4667     return float64_to_uint64_round_to_zero(f, s);
4668 }
4669 
4670 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4671 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4672 DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
4673 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4674 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4675 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4676 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4677 
4678 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4679 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4680 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4681 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4682 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4683 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4684 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4685 
4686 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4687 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4688 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4689 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4690 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4691 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4692 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4693 
4694 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4695 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4696 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4697 
4698 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4699 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4700 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4701 
4702 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4703 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4704 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4705 
4706 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4707 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4708 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4709 
4710 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4711 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4712 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4713 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4714 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4715 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4716 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4717 
4718 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4719 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4720 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4721 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4722 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4723 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4724 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4725 
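/* FLOGB: return the signed base-2 exponent of the input as an integer.
 * Infinity yields the maximum integer; zero, NaN, and flushed denormals
 * raise Invalid Operation and yield the minimum integer.
 */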
4726 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4727 {
4728     /* Extract frac to the top of the uint32_t. */
4729     uint32_t frac = (uint32_t)a << (16 + 6);
4730     int16_t exp = extract32(a, 10, 5);
4731 
4732     if (unlikely(exp == 0)) {
4733         if (frac != 0) {
4734             if (!get_flush_inputs_to_zero(s)) {
4735                 /* denormal: bias - fractional_zeros */
4736                 return -15 - clz32(frac);
4737             }
4738             /* flush to zero */
4739             float_raise(float_flag_input_denormal_flushed, s);
4740         }
4741     } else if (unlikely(exp == 0x1f)) {
4742         if (frac == 0) {
4743             return INT16_MAX; /* infinity */
4744         }
4745     } else {
4746         /* normal: exp - bias */
4747         return exp - 15;
4748     }
4749     /* nan or zero */
4750     float_raise(float_flag_invalid, s);
4751     return INT16_MIN;
4752 }
4753 
4754 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4755 {
4756     /* Extract frac to the top of the uint32_t. */
4757     uint32_t frac = a << 9;
4758     int32_t exp = extract32(a, 23, 8);
4759 
4760     if (unlikely(exp == 0)) {
4761         if (frac != 0) {
4762             if (!get_flush_inputs_to_zero(s)) {
4763                 /* denormal: bias - fractional_zeros */
4764                 return -127 - clz32(frac);
4765             }
4766             /* flush to zero */
4767             float_raise(float_flag_input_denormal_flushed, s);
4768         }
4769     } else if (unlikely(exp == 0xff)) {
4770         if (frac == 0) {
4771             return INT32_MAX; /* infinity */
4772         }
4773     } else {
4774         /* normal: exp - bias */
4775         return exp - 127;
4776     }
4777     /* nan or zero */
4778     float_raise(float_flag_invalid, s);
4779     return INT32_MIN;
4780 }
4781 
4782 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4783 {
4784     /* Extract frac to the top of the uint64_t. */
4785     uint64_t frac = a << 12;
4786     int64_t exp = extract64(a, 52, 11);
4787 
4788     if (unlikely(exp == 0)) {
4789         if (frac != 0) {
4790             if (!get_flush_inputs_to_zero(s)) {
4791                 /* denormal: bias - fractional_zeros */
4792                 return -1023 - clz64(frac);
4793             }
4794             /* flush to zero */
4795             float_raise(float_flag_input_denormal_flushed, s);
4796         }
4797     } else if (unlikely(exp == 0x7ff)) {
4798         if (frac == 0) {
4799             return INT64_MAX; /* infinity */
4800         }
4801     } else {
4802         /* normal: exp - bias */
4803         return exp - 1023;
4804     }
4805     /* nan or zero */
4806     float_raise(float_flag_invalid, s);
4807     return INT64_MIN;
4808 }
4809 
4810 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4811 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4812 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4813 
4814 #undef DO_ZPZ_FP
4815 
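/* Predicated fused multiply-add.  NEG1 and NEG3 are XOR masks applied
 * to the first and third operands; they flip the sign bit and are used
 * only when FPCR.AH == 0.  FLAGS carries float_muladd_negate_* bits,
 * used instead when FPCR.AH == 1 so that a NaN operand does not have
 * its sign flipped.
 */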
4816 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4817                             float_status *status, uint32_t desc,
4818                             uint16_t neg1, uint16_t neg3, int flags)
4819 {
4820     intptr_t i = simd_oprsz(desc);
4821     uint64_t *g = vg;
4822 
4823     do {
4824         uint64_t pg = g[(i - 1) >> 6];
4825         do {
4826             i -= 2;
4827             if (likely((pg >> (i & 63)) & 1)) {
4828                 float16 e1, e2, e3, r;
4829 
4830                 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4831                 e2 = *(uint16_t *)(vm + H1_2(i));
4832                 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4833                 r = float16_muladd(e1, e2, e3, flags, status);
4834                 *(uint16_t *)(vd + H1_2(i)) = r;
4835             }
4836         } while (i & 63);
4837     } while (i != 0);
4838 }
4839 
4840 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4841                               void *vg, float_status *status, uint32_t desc)
4842 {
4843     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4844 }
4845 
4846 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4847                               void *vg, float_status *status, uint32_t desc)
4848 {
4849     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0);
4850 }
4851 
4852 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4853                                void *vg, float_status *status, uint32_t desc)
4854 {
4855     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0);
4856 }
4857 
4858 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4859                                void *vg, float_status *status, uint32_t desc)
4860 {
4861     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0);
4862 }
4863 
4864 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4865                               void *vg, float_status *status, uint32_t desc)
4866 {
4867     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4868                     float_muladd_negate_product);
4869 }
4870 
4871 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4872                                void *vg, float_status *status, uint32_t desc)
4873 {
4874     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4875                     float_muladd_negate_product | float_muladd_negate_c);
4876 }
4877 
4878 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4879                                void *vg, float_status *status, uint32_t desc)
4880 {
4881     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4882                     float_muladd_negate_c);
4883 }
4884 
4885 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4886                             float_status *status, uint32_t desc,
4887                             uint32_t neg1, uint32_t neg3, int flags)
4888 {
4889     intptr_t i = simd_oprsz(desc);
4890     uint64_t *g = vg;
4891 
4892     do {
4893         uint64_t pg = g[(i - 1) >> 6];
4894         do {
4895             i -= 4;
4896             if (likely((pg >> (i & 63)) & 1)) {
4897                 float32 e1, e2, e3, r;
4898 
4899                 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4900                 e2 = *(uint32_t *)(vm + H1_4(i));
4901                 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4902                 r = float32_muladd(e1, e2, e3, flags, status);
4903                 *(uint32_t *)(vd + H1_4(i)) = r;
4904             }
4905         } while (i & 63);
4906     } while (i != 0);
4907 }
4908 
4909 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4910                               void *vg, float_status *status, uint32_t desc)
4911 {
4912     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4913 }
4914 
4915 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4916                               void *vg, float_status *status, uint32_t desc)
4917 {
4918     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0);
4919 }
4920 
4921 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4922                                void *vg, float_status *status, uint32_t desc)
4923 {
4924     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0);
4925 }
4926 
4927 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4928                                void *vg, float_status *status, uint32_t desc)
4929 {
4930     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0);
4931 }
4932 
4933 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4934                               void *vg, float_status *status, uint32_t desc)
4935 {
4936     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4937                     float_muladd_negate_product);
4938 }
4939 
4940 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4941                                void *vg, float_status *status, uint32_t desc)
4942 {
4943     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4944                     float_muladd_negate_product | float_muladd_negate_c);
4945 }
4946 
4947 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4948                                void *vg, float_status *status, uint32_t desc)
4949 {
4950     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4951                     float_muladd_negate_c);
4952 }
4953 
4954 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4955                             float_status *status, uint32_t desc,
4956                             uint64_t neg1, uint64_t neg3, int flags)
4957 {
4958     intptr_t i = simd_oprsz(desc);
4959     uint64_t *g = vg;
4960 
4961     do {
4962         uint64_t pg = g[(i - 1) >> 6];
4963         do {
4964             i -= 8;
4965             if (likely((pg >> (i & 63)) & 1)) {
4966                 float64 e1, e2, e3, r;
4967 
4968                 e1 = *(uint64_t *)(vn + i) ^ neg1;
4969                 e2 = *(uint64_t *)(vm + i);
4970                 e3 = *(uint64_t *)(va + i) ^ neg3;
4971                 r = float64_muladd(e1, e2, e3, flags, status);
4972                 *(uint64_t *)(vd + i) = r;
4973             }
4974         } while (i & 63);
4975     } while (i != 0);
4976 }
4977 
4978 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4979                               void *vg, float_status *status, uint32_t desc)
4980 {
4981     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4982 }
4983 
4984 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4985                               void *vg, float_status *status, uint32_t desc)
4986 {
4987     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0);
4988 }
4989 
4990 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4991                                void *vg, float_status *status, uint32_t desc)
4992 {
4993     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0);
4994 }
4995 
4996 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4997                                void *vg, float_status *status, uint32_t desc)
4998 {
4999     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0);
5000 }
5001 
5002 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5003                               void *vg, float_status *status, uint32_t desc)
5004 {
5005     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5006                     float_muladd_negate_product);
5007 }
5008 
5009 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5010                                void *vg, float_status *status, uint32_t desc)
5011 {
5012     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5013                     float_muladd_negate_product | float_muladd_negate_c);
5014 }
5015 
5016 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5017                                void *vg, float_status *status, uint32_t desc)
5018 {
5019     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5020                     float_muladd_negate_c);
5021 }
5022 
5023 /* Two operand floating-point comparison controlled by a predicate.
5024  * Unlike the integer version, we are not allowed to optimistically
5025  * compare operands, since the comparison may have side effects wrt
5026  * the FPSR.
5027  */
5028 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
5029 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
5030                   float_status *status, uint32_t desc)                  \
5031 {                                                                       \
5032     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
5033     uint64_t *d = vd, *g = vg;                                          \
5034     do {                                                                \
5035         uint64_t out = 0, pg = g[j];                                    \
5036         do {                                                            \
5037             i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
5038             if (likely((pg >> (i & 63)) & 1)) {                         \
5039                 TYPE nn = *(TYPE *)(vn + H(i));                         \
5040                 TYPE mm = *(TYPE *)(vm + H(i));                         \
5041                 out |= OP(TYPE, nn, mm, status);                        \
5042             }                                                           \
5043         } while (i & 63);                                               \
5044         d[j--] = out;                                                   \
5045     } while (i > 0);                                                    \
5046 }
5047 
5048 #define DO_FPCMP_PPZZ_H(NAME, OP) \
5049     DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
5050 #define DO_FPCMP_PPZZ_S(NAME, OP) \
5051     DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
5052 #define DO_FPCMP_PPZZ_D(NAME, OP) \
5053     DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
5054 
5055 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
5056     DO_FPCMP_PPZZ_H(NAME, OP)   \
5057     DO_FPCMP_PPZZ_S(NAME, OP)   \
5058     DO_FPCMP_PPZZ_D(NAME, OP)
5059 
5060 #define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
5061 #define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
5062 #define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
5063 #define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
5064 #define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
5065 #define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
5066 #define DO_FCMUO(TYPE, X, Y, ST)  \
5067     TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
5068 #define DO_FACGE(TYPE, X, Y, ST)  \
5069     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
5070 #define DO_FACGT(TYPE, X, Y, ST)  \
5071     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
5072 
5073 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
5074 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
5075 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
5076 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
5077 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
5078 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
5079 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
5080 
5081 #undef DO_FPCMP_PPZZ_ALL
5082 #undef DO_FPCMP_PPZZ_D
5083 #undef DO_FPCMP_PPZZ_S
5084 #undef DO_FPCMP_PPZZ_H
5085 #undef DO_FPCMP_PPZZ
5086 
5087 /* One operand floating-point comparison against zero, controlled
5088  * by a predicate.
5089  */
5090 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
5091 void HELPER(NAME)(void *vd, void *vn, void *vg,            \
5092                   float_status *status, uint32_t desc)     \
5093 {                                                          \
5094     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
5095     uint64_t *d = vd, *g = vg;                             \
5096     do {                                                   \
5097         uint64_t out = 0, pg = g[j];                       \
5098         do {                                               \
5099             i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
5100             if ((pg >> (i & 63)) & 1) {                    \
5101                 TYPE nn = *(TYPE *)(vn + H(i));            \
5102                 out |= OP(TYPE, nn, 0, status);            \
5103             }                                              \
5104         } while (i & 63);                                  \
5105         d[j--] = out;                                      \
5106     } while (i > 0);                                       \
5107 }
5108 
5109 #define DO_FPCMP_PPZ0_H(NAME, OP) \
5110     DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
5111 #define DO_FPCMP_PPZ0_S(NAME, OP) \
5112     DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
5113 #define DO_FPCMP_PPZ0_D(NAME, OP) \
5114     DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
5115 
5116 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
5117     DO_FPCMP_PPZ0_H(NAME, OP)   \
5118     DO_FPCMP_PPZ0_S(NAME, OP)   \
5119     DO_FPCMP_PPZ0_D(NAME, OP)
5120 
5121 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
5122 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
5123 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
5124 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
5125 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
5126 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
5127 
5128 /* FP Trig Multiply-Add. */
5129 
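/* The coefficient table is indexed by the immediate X; when the
 * multiplicand is negative the second half of the table is used.
 * With FPCR.AH == 0 the multiplicand's sign is removed by taking the
 * absolute value, while with FPCR.AH == 1 it is folded into the
 * muladd flags so that a NaN keeps its sign.
 */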
5130 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm,
5131                          float_status *s, uint32_t desc)
5132 {
5133     static const float16 coeff[16] = {
5134         0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5135         0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5136     };
5137     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
5138     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5139     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5140     float16 *d = vd, *n = vn, *m = vm;
5141 
5142     for (i = 0; i < opr_sz; i++) {
5143         float16 mm = m[i];
5144         intptr_t xx = x;
5145         int flags = 0;
5146 
5147         if (float16_is_neg(mm)) {
5148             if (fpcr_ah) {
5149                 flags = float_muladd_negate_product;
5150             } else {
5151                 mm = float16_abs(mm);
5152             }
5153             xx += 8;
5154         }
5155         d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s);
5156     }
5157 }
5158 
5159 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm,
5160                          float_status *s, uint32_t desc)
5161 {
5162     static const float32 coeff[16] = {
5163         0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5164         0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5165         0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5166         0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5167     };
5168     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5169     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5170     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5171     float32 *d = vd, *n = vn, *m = vm;
5172 
5173     for (i = 0; i < opr_sz; i++) {
5174         float32 mm = m[i];
5175         intptr_t xx = x;
5176         int flags = 0;
5177 
5178         if (float32_is_neg(mm)) {
5179             if (fpcr_ah) {
5180                 flags = float_muladd_negate_product;
5181             } else {
5182                 mm = float32_abs(mm);
5183             }
5184             xx += 8;
5185         }
5186         d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s);
5187     }
5188 }
5189 
5190 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm,
5191                          float_status *s, uint32_t desc)
5192 {
5193     static const float64 coeff[16] = {
5194         0x3ff0000000000000ull, 0xbfc5555555555543ull,
5195         0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5196         0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5197         0x3de5d8408868552full, 0x0000000000000000ull,
5198         0x3ff0000000000000ull, 0xbfe0000000000000ull,
5199         0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5200         0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5201         0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5202     };
5203     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5204     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5205     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5206     float64 *d = vd, *n = vn, *m = vm;
5207 
5208     for (i = 0; i < opr_sz; i++) {
5209         float64 mm = m[i];
5210         intptr_t xx = x;
5211         int flags = 0;
5212 
5213         if (float64_is_neg(mm)) {
5214             if (fpcr_ah) {
5215                 flags = float_muladd_negate_product;
5216             } else {
5217                 mm = float64_abs(mm);
5218             }
5219             xx += 8;
5220         }
5221         d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s);
5222     }
5223 }
5224 
5225 /*
5226  * FP Complex Add
5227  */
5228 
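/* FCADD adds the second operand rotated in the complex plane: with
 * ROT clear the imaginary part of Zm is negated before being added to
 * the real result (a 90 degree rotation); with ROT set the real part
 * of Zm is negated before being added to the imaginary result (270
 * degrees).  With FPCR.AH == 1 the negation does not change the sign
 * of a NaN.
 */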
5229 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5230                          float_status *s, uint32_t desc)
5231 {
5232     intptr_t j, i = simd_oprsz(desc);
5233     uint64_t *g = vg;
5234     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5235     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5236 
5237     do {
5238         uint64_t pg = g[(i - 1) >> 6];
5239         do {
5240             float16 e0, e1, e2, e3;
5241 
5242             /* I holds the real index; J holds the imag index.  */
5243             j = i - sizeof(float16);
5244             i -= 2 * sizeof(float16);
5245 
5246             e0 = *(float16 *)(vn + H1_2(i));
5247             e1 = *(float16 *)(vm + H1_2(j));
5248             e2 = *(float16 *)(vn + H1_2(j));
5249             e3 = *(float16 *)(vm + H1_2(i));
5250 
5251             if (rot) {
5252                 e3 = float16_maybe_ah_chs(e3, fpcr_ah);
5253             } else {
5254                 e1 = float16_maybe_ah_chs(e1, fpcr_ah);
5255             }
5256 
5257             if (likely((pg >> (i & 63)) & 1)) {
5258                 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s);
5259             }
5260             if (likely((pg >> (j & 63)) & 1)) {
5261                 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s);
5262             }
5263         } while (i & 63);
5264     } while (i != 0);
5265 }
5266 
5267 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5268                          float_status *s, uint32_t desc)
5269 {
5270     intptr_t j, i = simd_oprsz(desc);
5271     uint64_t *g = vg;
5272     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5273     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5274 
5275     do {
5276         uint64_t pg = g[(i - 1) >> 6];
5277         do {
5278             float32 e0, e1, e2, e3;
5279 
5280             /* I holds the real index; J holds the imag index.  */
5281             j = i - sizeof(float32);
5282             i -= 2 * sizeof(float32);
5283 
5284             e0 = *(float32 *)(vn + H1_2(i));
5285             e1 = *(float32 *)(vm + H1_2(j));
5286             e2 = *(float32 *)(vn + H1_2(j));
5287             e3 = *(float32 *)(vm + H1_2(i));
5288 
5289             if (rot) {
5290                 e3 = float32_maybe_ah_chs(e3, fpcr_ah);
5291             } else {
5292                 e1 = float32_maybe_ah_chs(e1, fpcr_ah);
5293             }
5294 
5295             if (likely((pg >> (i & 63)) & 1)) {
5296                 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s);
5297             }
5298             if (likely((pg >> (j & 63)) & 1)) {
5299                 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s);
5300             }
5301         } while (i & 63);
5302     } while (i != 0);
5303 }
5304 
5305 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5306                          float_status *s, uint32_t desc)
5307 {
5308     intptr_t j, i = simd_oprsz(desc);
5309     uint64_t *g = vg;
5310     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5311     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5312 
5313     do {
5314         uint64_t pg = g[(i - 1) >> 6];
5315         do {
5316             float64 e0, e1, e2, e3;
5317 
5318             /* I holds the real index; J holds the imag index.  */
5319             j = i - sizeof(float64);
5320             i -= 2 * sizeof(float64);
5321 
5322             e0 = *(float64 *)(vn + H1_2(i));
5323             e1 = *(float64 *)(vm + H1_2(j));
5324             e2 = *(float64 *)(vn + H1_2(j));
5325             e3 = *(float64 *)(vm + H1_2(i));
5326 
5327             if (rot) {
5328                 e3 = float64_maybe_ah_chs(e3, fpcr_ah);
5329             } else {
5330                 e1 = float64_maybe_ah_chs(e1, fpcr_ah);
5331             }
5332 
5333             if (likely((pg >> (i & 63)) & 1)) {
5334                 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s);
5335             }
5336             if (likely((pg >> (j & 63)) & 1)) {
5337                 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s);
5338             }
5339         } while (i & 63);
5340     } while (i != 0);
5341 }
5342 
5343 /*
5344  * FP Complex Multiply
5345  */
5346 
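/* FCMLA: each adjacent pair of elements forms a complex number.  The
 * descriptor encodes the rotation as a "flip" bit, which selects the
 * imaginary rather than the real element of Zn (with Zm's elements
 * swapped to match), plus a negate-imaginary bit; the real negation is
 * derived as flip ^ negate_imag.  As for FMLA above, the negations are
 * applied as sign-bit XORs when FPCR.AH == 0 and as
 * float_muladd_negate_product flags when FPCR.AH == 1.
 */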
5347 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5348                                void *vg, float_status *status, uint32_t desc)
5349 {
5350     intptr_t j, i = simd_oprsz(desc);
5351     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5352     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5353     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5354     uint32_t negf_real = flip ^ negf_imag;
5355     float16 negx_imag, negx_real;
5356     uint64_t *g = vg;
5357 
5358     /* With AH=0, use negx; with AH=1 use negf. */
5359     negx_real = (negf_real & ~fpcr_ah) << 15;
5360     negx_imag = (negf_imag & ~fpcr_ah) << 15;
5361     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5362     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5363 
5364     do {
5365         uint64_t pg = g[(i - 1) >> 6];
5366         do {
5367             float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5368 
5369             /* I holds the real index; J holds the imag index.  */
5370             j = i - sizeof(float16);
5371             i -= 2 * sizeof(float16);
5372 
5373             nr = *(float16 *)(vn + H1_2(i));
5374             ni = *(float16 *)(vn + H1_2(j));
5375             mr = *(float16 *)(vm + H1_2(i));
5376             mi = *(float16 *)(vm + H1_2(j));
5377 
5378             e2 = (flip ? ni : nr);
5379             e1 = (flip ? mi : mr) ^ negx_real;
5380             e4 = e2;
5381             e3 = (flip ? mr : mi) ^ negx_imag;
5382 
5383             if (likely((pg >> (i & 63)) & 1)) {
5384                 d = *(float16 *)(va + H1_2(i));
5385                 d = float16_muladd(e2, e1, d, negf_real, status);
5386                 *(float16 *)(vd + H1_2(i)) = d;
5387             }
5388             if (likely((pg >> (j & 63)) & 1)) {
5389                 d = *(float16 *)(va + H1_2(j));
5390                 d = float16_muladd(e4, e3, d, negf_imag, status);
5391                 *(float16 *)(vd + H1_2(j)) = d;
5392             }
5393         } while (i & 63);
5394     } while (i != 0);
5395 }
5396 
5397 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5398                                void *vg, float_status *status, uint32_t desc)
5399 {
5400     intptr_t j, i = simd_oprsz(desc);
5401     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5402     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5403     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5404     uint32_t negf_real = flip ^ negf_imag;
5405     float32 negx_imag, negx_real;
5406     uint64_t *g = vg;
5407 
5408     /* With AH=0, use negx; with AH=1 use negf. */
5409     negx_real = (negf_real & ~fpcr_ah) << 31;
5410     negx_imag = (negf_imag & ~fpcr_ah) << 31;
5411     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5412     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5413 
5414     do {
5415         uint64_t pg = g[(i - 1) >> 6];
5416         do {
5417             float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5418 
5419             /* I holds the real index; J holds the imag index.  */
5420             j = i - sizeof(float32);
5421             i -= 2 * sizeof(float32);
5422 
5423             nr = *(float32 *)(vn + H1_2(i));
5424             ni = *(float32 *)(vn + H1_2(j));
5425             mr = *(float32 *)(vm + H1_2(i));
5426             mi = *(float32 *)(vm + H1_2(j));
5427 
5428             e2 = (flip ? ni : nr);
5429             e1 = (flip ? mi : mr) ^ negx_real;
5430             e4 = e2;
5431             e3 = (flip ? mr : mi) ^ negx_imag;
5432 
5433             if (likely((pg >> (i & 63)) & 1)) {
5434                 d = *(float32 *)(va + H1_2(i));
5435                 d = float32_muladd(e2, e1, d, negf_real, status);
5436                 *(float32 *)(vd + H1_2(i)) = d;
5437             }
5438             if (likely((pg >> (j & 63)) & 1)) {
5439                 d = *(float32 *)(va + H1_2(j));
5440                 d = float32_muladd(e4, e3, d, negf_imag, status);
5441                 *(float32 *)(vd + H1_2(j)) = d;
5442             }
5443         } while (i & 63);
5444     } while (i != 0);
5445 }
5446 
5447 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5448                                void *vg, float_status *status, uint32_t desc)
5449 {
5450     intptr_t j, i = simd_oprsz(desc);
5451     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5452     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5453     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5454     uint32_t negf_real = flip ^ negf_imag;
5455     float64 negx_imag, negx_real;
5456     uint64_t *g = vg;
5457 
5458     /* With AH=0, use negx; with AH=1 use negf. */
5459     negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
5460     negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
5461     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5462     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5463 
5464     do {
5465         uint64_t pg = g[(i - 1) >> 6];
5466         do {
5467             float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5468 
5469             /* I holds the real index; J holds the imag index.  */
5470             j = i - sizeof(float64);
5471             i -= 2 * sizeof(float64);
5472 
5473             nr = *(float64 *)(vn + H1_2(i));
5474             ni = *(float64 *)(vn + H1_2(j));
5475             mr = *(float64 *)(vm + H1_2(i));
5476             mi = *(float64 *)(vm + H1_2(j));
5477 
5478             e2 = (flip ? ni : nr);
5479             e1 = (flip ? mi : mr) ^ negx_real;
5480             e4 = e2;
5481             e3 = (flip ? mr : mi) ^ negx_imag;
5482 
5483             if (likely((pg >> (i & 63)) & 1)) {
5484                 d = *(float64 *)(va + H1_2(i));
5485                 d = float64_muladd(e2, e1, d, negf_real, status);
5486                 *(float64 *)(vd + H1_2(i)) = d;
5487             }
5488             if (likely((pg >> (j & 63)) & 1)) {
5489                 d = *(float64 *)(va + H1_2(j));
5490                 d = float64_muladd(e4, e3, d, negf_imag, status);
5491                 *(float64 *)(vd + H1_2(j)) = d;
5492             }
5493         } while (i & 63);
5494     } while (i != 0);
5495 }
5496 
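/*
 * A minimal illustrative sketch of the two negation mechanisms selected
 * above ("with AH=0, use negx; with AH=1, use negf") for one product
 * term.  The helper name fcmla_term_example is hypothetical and nothing
 * below is called by the helpers in this file.  With FPCR.AH == 0 the
 * sign bit of the multiplicand itself is flipped (the negx_* XOR masks),
 * so a NaN operand has its sign toggled too; with FPCR.AH == 1 the
 * negation is folded into the fused multiply-add via
 * float_muladd_negate_product (the negf_* flags), leaving NaN operands
 * untouched.
 */
static inline float16 fcmla_term_example(float16 n, float16 m, float16 a,
                                         bool neg, bool fpcr_ah,
                                         float_status *status)
{
    if (neg && !fpcr_ah) {
        m ^= 0x8000;                   /* negx: flip the operand sign bit */
        return float16_muladd(n, m, a, 0, status);
    }
    /* negf: let the fma core negate the product instead. */
    return float16_muladd(n, m, a,
                          neg ? float_muladd_negate_product : 0, status);
}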
5497 /*
5498  * Load contiguous data, protected by a governing predicate.
5499  */
5500 
5501 /*
5502  * Skip through a sequence of inactive elements in the guarding predicate @vg,
5503  * beginning at @reg_off, bounded by @reg_max.  Return the offset of the first
5504  * active element >= @reg_off, or @reg_max if there are no active elements.
5505  */
5506 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5507                                  intptr_t reg_max, int esz)
5508 {
5509     uint64_t pg_mask = pred_esz_masks[esz];
5510     uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5511 
5512     /* In normal usage, the first element is active.  */
5513     if (likely(pg & 1)) {
5514         return reg_off;
5515     }
5516 
5517     if (pg == 0) {
5518         reg_off &= -64;
5519         do {
5520             reg_off += 64;
5521             if (unlikely(reg_off >= reg_max)) {
5522                 /* The entire predicate was false.  */
5523                 return reg_max;
5524             }
5525             pg = vg[reg_off >> 6] & pg_mask;
5526         } while (pg == 0);
5527     }
5528     reg_off += ctz64(pg);
5529 
5530     /* We should never see an out of range predicate bit set.  */
5531     tcg_debug_assert(reg_off < reg_max);
5532     return reg_off;
5533 }
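
/*
 * A minimal usage sketch for find_next_active(); the function below is
 * hypothetical and unused, kept only as an illustration.  For 32-bit
 * elements only every fourth predicate bit is significant
 * (pred_esz_masks[MO_32] == 0x1111111111111111), so a lone bit 12 in
 * the first predicate word means "the element at byte offset 12".
 */
static void G_GNUC_UNUSED find_next_active_example(void)
{
    uint64_t vg[1] = { 0x1000 };   /* only the element at byte 12 active */

    tcg_debug_assert(find_next_active(vg, 0, 64, MO_32) == 12);
    /* Nothing is active at or beyond byte 16, so reg_max is returned. */
    tcg_debug_assert(find_next_active(vg, 16, 64, MO_32) == 64);
}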
5534 
5535 /*
5536  * Resolve the guest virtual address to info->host and info->flags.
5537  * If @nofault, return false if the page is invalid, otherwise
5538  * exit via page fault exception.
5539  */
5540 
5541 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5542                     target_ulong addr, int mem_off, MMUAccessType access_type,
5543                     int mmu_idx, uintptr_t retaddr)
5544 {
5545     int flags;
5546 
5547     addr += mem_off;
5548 
5549     /*
5550      * User-only currently always issues with TBI.  See the comment
5551      * above useronly_clean_ptr.  Usually we clean this top byte away
5552      * during translation, but we can't do that for e.g. vector + imm
5553      * addressing modes.
5554      *
5555      * We currently always enable TBI for user-only, and do not provide
5556      * a way to turn it off.  So clean the pointer unconditionally here,
5557      * rather than look it up here, or pass it down from above.
5558      */
5559     addr = useronly_clean_ptr(addr);
5560 
5561 #ifdef CONFIG_USER_ONLY
5562     flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5563                                &info->host, retaddr);
5564 #else
5565     CPUTLBEntryFull *full;
5566     flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5567                               &info->host, &full, retaddr);
5568 #endif
5569     info->flags = flags;
5570 
5571     if (flags & TLB_INVALID_MASK) {
5572         g_assert(nofault);
5573         return false;
5574     }
5575 
5576 #ifdef CONFIG_USER_ONLY
5577     memset(&info->attrs, 0, sizeof(info->attrs));
5578     /* Require both ANON and MTE; see allocation_tag_mem(). */
5579     info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5580 #else
5581     info->attrs = full->attrs;
5582     info->tagged = full->extra.arm.pte_attrs == 0xf0;
5583 #endif
5584 
5585     /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5586     info->host -= mem_off;
5587     return true;
5588 }
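
/*
 * A minimal usage sketch for sve_probe_page(); ld1_byte_example is
 * hypothetical and unused, and watchpoint/MTE handling is omitted.
 * The intended pattern is: probe once, then use the returned host
 * pointer for ordinary RAM and fall back to a TLB access for MMIO.
 */
static uint8_t G_GNUC_UNUSED
ld1_byte_example(CPUARMState *env, target_ulong addr, uintptr_t ra)
{
    SVEHostPage info;

    /* nofault == false: an invalid page raises the fault right here. */
    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
                   arm_env_mmu_index(env), ra);

    if (likely(!(info.flags & TLB_MMIO))) {
        return *(uint8_t *)info.host;           /* direct host access */
    }
    return cpu_ldub_data_ra(env, useronly_clean_ptr(addr), ra);
}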
5589 
5590 /*
5591  * Find first active element on each page, and a loose bound for the
5592  * final element on each page.  Identify any single element that spans
5593  * the page boundary.  Return true if there are any active elements.
5594  */
5595 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5596                             intptr_t reg_max, int esz, int msize)
5597 {
5598     const int esize = 1 << esz;
5599     const uint64_t pg_mask = pred_esz_masks[esz];
5600     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5601     intptr_t mem_off_last, mem_off_split;
5602     intptr_t page_split, elt_split;
5603     intptr_t i;
5604 
5605     /* Set all of the element indices to -1, and the TLB data to 0. */
5606     memset(info, -1, offsetof(SVEContLdSt, page));
5607     memset(info->page, 0, sizeof(info->page));
5608 
5609     /* Gross scan over the entire predicate to find bounds. */
5610     i = 0;
5611     do {
5612         uint64_t pg = vg[i] & pg_mask;
5613         if (pg) {
5614             reg_off_last = i * 64 + 63 - clz64(pg);
5615             if (reg_off_first < 0) {
5616                 reg_off_first = i * 64 + ctz64(pg);
5617             }
5618         }
5619     } while (++i * 64 < reg_max);
5620 
5621     if (unlikely(reg_off_first < 0)) {
5622         /* No active elements, no pages touched. */
5623         return false;
5624     }
5625     tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5626 
5627     info->reg_off_first[0] = reg_off_first;
5628     info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5629     mem_off_last = (reg_off_last >> esz) * msize;
5630 
5631     page_split = -(addr | TARGET_PAGE_MASK);
5632     if (likely(mem_off_last + msize <= page_split)) {
5633         /* The entire operation fits within a single page. */
5634         info->reg_off_last[0] = reg_off_last;
5635         return true;
5636     }
5637 
5638     info->page_split = page_split;
5639     elt_split = page_split / msize;
5640     reg_off_split = elt_split << esz;
5641     mem_off_split = elt_split * msize;
5642 
5643     /*
5644      * This is the last full element on the first page, but it is not
5645      * necessarily active.  If there is no full element, i.e. the first
5646      * active element is the one that's split, this value remains -1.
5647      * It is useful as iteration bounds.
5648      */
5649     if (elt_split != 0) {
5650         info->reg_off_last[0] = reg_off_split - esize;
5651     }
5652 
5653     /* Determine if an unaligned element spans the pages.  */
5654     if (page_split % msize != 0) {
5655         /* It is helpful to know if the split element is active. */
5656         if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5657             info->reg_off_split = reg_off_split;
5658             info->mem_off_split = mem_off_split;
5659 
5660             if (reg_off_split == reg_off_last) {
5661                 /* The page crossing element is last. */
5662                 return true;
5663             }
5664         }
5665         reg_off_split += esize;
5666         mem_off_split += msize;
5667     }
5668 
5669     /*
5670      * We do want the first active element on the second page, because
5671      * this may affect the address reported in an exception.
5672      */
5673     reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5674     tcg_debug_assert(reg_off_split <= reg_off_last);
5675     info->reg_off_first[1] = reg_off_split;
5676     info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5677     info->reg_off_last[1] = reg_off_last;
5678     return true;
5679 }
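
/*
 * A worked example of the bookkeeping above (illustrative values only):
 * with 4 KiB pages, reg_max = 32 (a 256-bit vector), esz = 3 and
 * msize = 8 (an LD1D), all elements active, and @addr twelve bytes
 * short of a page boundary, we get page_split = 12 and elt_split = 1.
 * Element 0 lies wholly on the first page (reg_off_last[0] = 0),
 * element 1 straddles the boundary (reg_off_split = mem_off_split = 8),
 * and elements 2 and 3 fall on the second page (reg_off_first[1] = 16,
 * reg_off_last[1] = 24).
 */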
5680 
5681 /*
5682  * Resolve the guest virtual addresses to info->page[].
5683  * Control the generation of page faults with @fault.  Return false if
5684  * there is no work to do, which can only happen with @fault == FAULT_NO.
5685  */
5686 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5687                          CPUARMState *env, target_ulong addr,
5688                          MMUAccessType access_type, uintptr_t retaddr)
5689 {
5690     int mmu_idx = arm_env_mmu_index(env);
5691     int mem_off = info->mem_off_first[0];
5692     bool nofault = fault == FAULT_NO;
5693     bool have_work = true;
5694 
5695     if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5696                         access_type, mmu_idx, retaddr)) {
5697         /* No work to be done. */
5698         return false;
5699     }
5700 
5701     if (likely(info->page_split < 0)) {
5702         /* The entire operation was on the one page. */
5703         return true;
5704     }
5705 
5706     /*
5707      * If the second page is invalid, then we want the fault address to be
5708      * the first byte on that page which is accessed.
5709      */
5710     if (info->mem_off_split >= 0) {
5711         /*
5712          * There is an element split across the pages.  The fault address
5713          * should be the first byte of the second page.
5714          */
5715         mem_off = info->page_split;
5716         /*
5717          * If the split element is also the first active element
5718          * of the vector, then:  For first-fault we should continue
5719          * to generate faults for the second page.  For no-fault,
5720          * we have work only if the second page is valid.
5721          */
5722         if (info->mem_off_first[0] < info->mem_off_split) {
5723             nofault = FAULT_FIRST;
5724             have_work = false;
5725         }
5726     } else {
5727         /*
5728          * There is no element split across the pages.  The fault address
5729          * should be the first active element on the second page.
5730          */
5731         mem_off = info->mem_off_first[1];
5732         /*
5733          * There must have been one active element on the first page,
5734          * so we're out of first-fault territory.
5735          */
5736         nofault = fault != FAULT_ALL;
5737     }
5738 
5739     have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5740                                 access_type, mmu_idx, retaddr);
5741     return have_work;
5742 }
5743 
5744 #ifndef CONFIG_USER_ONLY
5745 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5746                                uint64_t *vg, target_ulong addr,
5747                                int esize, int msize, int wp_access,
5748                                uintptr_t retaddr)
5749 {
5750     intptr_t mem_off, reg_off, reg_last;
5751     int flags0 = info->page[0].flags;
5752     int flags1 = info->page[1].flags;
5753 
5754     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5755         return;
5756     }
5757 
5758     /* Indicate that watchpoints are handled. */
5759     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5760     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5761 
5762     if (flags0 & TLB_WATCHPOINT) {
5763         mem_off = info->mem_off_first[0];
5764         reg_off = info->reg_off_first[0];
5765         reg_last = info->reg_off_last[0];
5766 
5767         while (reg_off <= reg_last) {
5768             uint64_t pg = vg[reg_off >> 6];
5769             do {
5770                 if ((pg >> (reg_off & 63)) & 1) {
5771                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5772                                          msize, info->page[0].attrs,
5773                                          wp_access, retaddr);
5774                 }
5775                 reg_off += esize;
5776                 mem_off += msize;
5777             } while (reg_off <= reg_last && (reg_off & 63));
5778         }
5779     }
5780 
5781     mem_off = info->mem_off_split;
5782     if (mem_off >= 0) {
5783         cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5784                              info->page[0].attrs, wp_access, retaddr);
5785     }
5786 
5787     mem_off = info->mem_off_first[1];
5788     if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5789         reg_off = info->reg_off_first[1];
5790         reg_last = info->reg_off_last[1];
5791 
5792         do {
5793             uint64_t pg = vg[reg_off >> 6];
5794             do {
5795                 if ((pg >> (reg_off & 63)) & 1) {
5796                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5797                                          msize, info->page[1].attrs,
5798                                          wp_access, retaddr);
5799                 }
5800                 reg_off += esize;
5801                 mem_off += msize;
5802             } while (reg_off & 63);
5803         } while (reg_off <= reg_last);
5804     }
5805 }
5806 #endif
5807 
5808 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5809                              uint64_t *vg, target_ulong addr, int esize,
5810                              int msize, uint32_t mtedesc, uintptr_t ra)
5811 {
5812     intptr_t mem_off, reg_off, reg_last;
5813 
5814     /* Process the page only if MemAttr == Tagged. */
5815     if (info->page[0].tagged) {
5816         mem_off = info->mem_off_first[0];
5817         reg_off = info->reg_off_first[0];
5818         reg_last = info->reg_off_split;
5819         if (reg_last < 0) {
5820             reg_last = info->reg_off_last[0];
5821         }
5822 
5823         do {
5824             uint64_t pg = vg[reg_off >> 6];
5825             do {
5826                 if ((pg >> (reg_off & 63)) & 1) {
5827                     mte_check(env, mtedesc, addr + mem_off, ra);
5828                 }
5829                 reg_off += esize;
5830                 mem_off += msize;
5831             } while (reg_off <= reg_last && (reg_off & 63));
5832         } while (reg_off <= reg_last);
5833     }
5834 
5835     mem_off = info->mem_off_first[1];
5836     if (mem_off >= 0 && info->page[1].tagged) {
5837         reg_off = info->reg_off_first[1];
5838         reg_last = info->reg_off_last[1];
5839 
5840         do {
5841             uint64_t pg = vg[reg_off >> 6];
5842             do {
5843                 if ((pg >> (reg_off & 63)) & 1) {
5844                     mte_check(env, mtedesc, addr + mem_off, ra);
5845                 }
5846                 reg_off += esize;
5847                 mem_off += msize;
5848             } while (reg_off & 63);
5849         } while (reg_off <= reg_last);
5850     }
5851 }
5852 
5853 /*
5854  * Common helper for all contiguous 1,2,3,4-register predicated loads.
5855  */
5856 static inline QEMU_ALWAYS_INLINE
5857 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5858                uint32_t desc, const uintptr_t retaddr,
5859                const int esz, const int msz, const int N, uint32_t mtedesc,
5860                sve_ldst1_host_fn *host_fn,
5861                sve_ldst1_tlb_fn *tlb_fn)
5862 {
5863     const unsigned rd = simd_data(desc);
5864     const intptr_t reg_max = simd_oprsz(desc);
5865     intptr_t reg_off, reg_last, mem_off;
5866     SVEContLdSt info;
5867     void *host;
5868     int flags, i;
5869 
5870     /* Find the active elements.  */
5871     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5872         /* The entire predicate was false; no load occurs.  */
5873         for (i = 0; i < N; ++i) {
5874             memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5875         }
5876         return;
5877     }
5878 
5879     /* Probe the page(s).  Exit with exception for any invalid page. */
5880     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5881 
5882     /* Handle watchpoints for all active elements. */
5883     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5884                               BP_MEM_READ, retaddr);
5885 
5886     /*
5887      * Handle mte checks for all active elements.
5888      * Since TBI must be set for MTE, !mtedesc => !mte_active.
5889      */
5890     if (mtedesc) {
5891         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5892                                 mtedesc, retaddr);
5893     }
5894 
5895     flags = info.page[0].flags | info.page[1].flags;
5896     if (unlikely(flags != 0)) {
5897         /*
5898          * At least one page includes MMIO.
5899          * Any bus operation can fail with cpu_transaction_failed,
5900          * which for ARM will raise SyncExternal.  Perform the load
5901          * into scratch memory to preserve register state until the end.
5902          */
5903         ARMVectorReg scratch[4] = { };
5904 
5905         mem_off = info.mem_off_first[0];
5906         reg_off = info.reg_off_first[0];
5907         reg_last = info.reg_off_last[1];
5908         if (reg_last < 0) {
5909             reg_last = info.reg_off_split;
5910             if (reg_last < 0) {
5911                 reg_last = info.reg_off_last[0];
5912             }
5913         }
5914 
5915         do {
5916             uint64_t pg = vg[reg_off >> 6];
5917             do {
5918                 if ((pg >> (reg_off & 63)) & 1) {
5919                     for (i = 0; i < N; ++i) {
5920                         tlb_fn(env, &scratch[i], reg_off,
5921                                addr + mem_off + (i << msz), retaddr);
5922                     }
5923                 }
5924                 reg_off += 1 << esz;
5925                 mem_off += N << msz;
5926             } while (reg_off & 63);
5927         } while (reg_off <= reg_last);
5928 
5929         for (i = 0; i < N; ++i) {
5930             memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5931         }
5932         return;
5933     }
5934 
5935     /* The entire operation is in RAM, on valid pages. */
5936 
5937     for (i = 0; i < N; ++i) {
5938         memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5939     }
5940 
5941     mem_off = info.mem_off_first[0];
5942     reg_off = info.reg_off_first[0];
5943     reg_last = info.reg_off_last[0];
5944     host = info.page[0].host;
5945 
5946     set_helper_retaddr(retaddr);
5947 
5948     while (reg_off <= reg_last) {
5949         uint64_t pg = vg[reg_off >> 6];
5950         do {
5951             if ((pg >> (reg_off & 63)) & 1) {
5952                 for (i = 0; i < N; ++i) {
5953                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5954                             host + mem_off + (i << msz));
5955                 }
5956             }
5957             reg_off += 1 << esz;
5958             mem_off += N << msz;
5959         } while (reg_off <= reg_last && (reg_off & 63));
5960     }
5961 
5962     clear_helper_retaddr();
5963 
5964     /*
5965      * Use the slow path to manage the cross-page misalignment.
5966      * But we know this is RAM and cannot trap.
5967      */
5968     mem_off = info.mem_off_split;
5969     if (unlikely(mem_off >= 0)) {
5970         reg_off = info.reg_off_split;
5971         for (i = 0; i < N; ++i) {
5972             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5973                    addr + mem_off + (i << msz), retaddr);
5974         }
5975     }
5976 
5977     mem_off = info.mem_off_first[1];
5978     if (unlikely(mem_off >= 0)) {
5979         reg_off = info.reg_off_first[1];
5980         reg_last = info.reg_off_last[1];
5981         host = info.page[1].host;
5982 
5983         set_helper_retaddr(retaddr);
5984 
5985         do {
5986             uint64_t pg = vg[reg_off >> 6];
5987             do {
5988                 if ((pg >> (reg_off & 63)) & 1) {
5989                     for (i = 0; i < N; ++i) {
5990                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5991                                 host + mem_off + (i << msz));
5992                     }
5993                 }
5994                 reg_off += 1 << esz;
5995                 mem_off += N << msz;
5996             } while (reg_off & 63);
5997         } while (reg_off <= reg_last);
5998 
5999         clear_helper_retaddr();
6000     }
6001 }
6002 
6003 static inline QEMU_ALWAYS_INLINE
6004 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6005                    uint32_t desc, const uintptr_t ra,
6006                    const int esz, const int msz, const int N,
6007                    sve_ldst1_host_fn *host_fn,
6008                    sve_ldst1_tlb_fn *tlb_fn)
6009 {
6010     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6011     int bit55 = extract64(addr, 55, 1);
6012 
6013     /* Remove mtedesc from the normal sve descriptor. */
6014     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6015 
6016     /* Perform gross MTE suppression early. */
6017     if (!tbi_check(mtedesc, bit55) ||
6018         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6019         mtedesc = 0;
6020     }
6021 
6022     sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6023 }
6024 
6025 #define DO_LD1_1(NAME, ESZ)                                             \
6026 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
6027                             target_ulong addr, uint32_t desc)           \
6028 {                                                                       \
6029     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
6030               sve_##NAME##_host, sve_##NAME##_tlb);                     \
6031 }                                                                       \
6032 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
6033                                 target_ulong addr, uint32_t desc)       \
6034 {                                                                       \
6035     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
6036                   sve_##NAME##_host, sve_##NAME##_tlb);                 \
6037 }
6038 
6039 #define DO_LD1_2(NAME, ESZ, MSZ)                                        \
6040 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
6041                                target_ulong addr, uint32_t desc)        \
6042 {                                                                       \
6043     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
6044               sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
6045 }                                                                       \
6046 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
6047                                target_ulong addr, uint32_t desc)        \
6048 {                                                                       \
6049     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
6050               sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
6051 }                                                                       \
6052 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
6053                                    target_ulong addr, uint32_t desc)    \
6054 {                                                                       \
6055     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
6056                   sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
6057 }                                                                       \
6058 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
6059                                    target_ulong addr, uint32_t desc)    \
6060 {                                                                       \
6061     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
6062                   sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
6063 }
6064 
6065 DO_LD1_1(ld1bb,  MO_8)
6066 DO_LD1_1(ld1bhu, MO_16)
6067 DO_LD1_1(ld1bhs, MO_16)
6068 DO_LD1_1(ld1bsu, MO_32)
6069 DO_LD1_1(ld1bss, MO_32)
6070 DO_LD1_1(ld1bdu, MO_64)
6071 DO_LD1_1(ld1bds, MO_64)
6072 
6073 DO_LD1_2(ld1hh,  MO_16, MO_16)
6074 DO_LD1_2(ld1hsu, MO_32, MO_16)
6075 DO_LD1_2(ld1hss, MO_32, MO_16)
6076 DO_LD1_2(ld1hdu, MO_64, MO_16)
6077 DO_LD1_2(ld1hds, MO_64, MO_16)
6078 
6079 DO_LD1_2(ld1ss,  MO_32, MO_32)
6080 DO_LD1_2(ld1sdu, MO_64, MO_32)
6081 DO_LD1_2(ld1sds, MO_64, MO_32)
6082 
6083 DO_LD1_2(ld1dd,  MO_64, MO_64)
6084 
6085 #undef DO_LD1_1
6086 #undef DO_LD1_2
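
/*
 * In the instantiations above, the letter after "ld1" names the memory
 * access size (b/h/s/d) and the remainder the element form.  As an
 * example, DO_LD1_1(ld1bhu, MO_16) produces helpers that read one byte
 * per active element (msz = MO_8) and zero-extend it into a 16-bit
 * vector element (esz = MO_16); ld1bhs is the sign-extending variant.
 */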
6087 
6088 #define DO_LDN_1(N)                                                     \
6089 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
6090                              target_ulong addr, uint32_t desc)          \
6091 {                                                                       \
6092     sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
6093               sve_ld1bb_host, sve_ld1bb_tlb);                           \
6094 }                                                                       \
6095 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
6096                                  target_ulong addr, uint32_t desc)      \
6097 {                                                                       \
6098     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
6099                   sve_ld1bb_host, sve_ld1bb_tlb);                       \
6100 }
6101 
6102 #define DO_LDN_2(N, SUFF, ESZ)                                          \
6103 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
6104                                     target_ulong addr, uint32_t desc)   \
6105 {                                                                       \
6106     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
6107               sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
6108 }                                                                       \
6109 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
6110                                     target_ulong addr, uint32_t desc)   \
6111 {                                                                       \
6112     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
6113               sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
6114 }                                                                       \
6115 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
6116                                         target_ulong addr, uint32_t desc) \
6117 {                                                                       \
6118     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
6119                   sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
6120 }                                                                       \
6121 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
6122                                         target_ulong addr, uint32_t desc) \
6123 {                                                                       \
6124     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
6125                   sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
6126 }
6127 
6128 DO_LDN_1(2)
6129 DO_LDN_1(3)
6130 DO_LDN_1(4)
6131 
6132 DO_LDN_2(2, hh, MO_16)
6133 DO_LDN_2(3, hh, MO_16)
6134 DO_LDN_2(4, hh, MO_16)
6135 
6136 DO_LDN_2(2, ss, MO_32)
6137 DO_LDN_2(3, ss, MO_32)
6138 DO_LDN_2(4, ss, MO_32)
6139 
6140 DO_LDN_2(2, dd, MO_64)
6141 DO_LDN_2(3, dd, MO_64)
6142 DO_LDN_2(4, dd, MO_64)
6143 
6144 #undef DO_LDN_1
6145 #undef DO_LDN_2
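
/*
 * As an example of the expansion above, DO_LDN_2(2, hh, MO_16) emits
 * sve_ld2hh_le_r, sve_ld2hh_be_r and their _mte variants, each a thin
 * wrapper around sve_ldN_r()/sve_ldN_r_mte() with esz = msz = MO_16 and
 * N = 2, reusing the single-register sve_ld1hh_{le,be}_{host,tlb}
 * accessors for every interleaved register.
 */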
6146 
6147 /*
6148  * Load contiguous data, first-fault and no-fault.
6149  *
6150  * For user-only, we control the race between page_check_range and
6151  * another thread's munmap by using set/clear_helper_retaddr.  Any
6152  * SEGV that occurs between those markers is assumed to be because
6153  * the guest page vanished.  Keep that block as small as possible
6154  * so that unrelated QEMU bugs are not blamed on the guest.
6155  */
6156 
6157 /* Fault on byte I.  All bits in FFR from I are cleared.  The vector
6158  * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
6159  * option, which leaves subsequent data unchanged.
6160  */
6161 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
6162 {
6163     uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
6164 
6165     if (i & 63) {
6166         ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
6167         i = ROUND_UP(i, 64);
6168     }
6169     for (; i < oprsz; i += 64) {
6170         ffr[i / 64] = 0;
6171     }
6172 }
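
/*
 * A worked example (illustrative values only): with oprsz = 256, i.e. a
 * 2048-bit vector and a 256-bit FFR, a fault recorded at byte i = 70
 * keeps FFR bits 64..69 (the low bits of word 1), clears the rest of
 * word 1 via the masked AND, and then clears words 2 and 3 entirely in
 * the trailing loop.  Bits below 64 (word 0) are untouched.
 */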
6173 
6174 /*
6175  * Common helper for all contiguous no-fault and first-fault loads.
6176  */
6177 static inline QEMU_ALWAYS_INLINE
6178 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6179                    uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6180                    const int esz, const int msz, const SVEContFault fault,
6181                    sve_ldst1_host_fn *host_fn,
6182                    sve_ldst1_tlb_fn *tlb_fn)
6183 {
6184     const unsigned rd = simd_data(desc);
6185     void *vd = &env->vfp.zregs[rd];
6186     const intptr_t reg_max = simd_oprsz(desc);
6187     intptr_t reg_off, mem_off, reg_last;
6188     SVEContLdSt info;
6189     int flags;
6190     void *host;
6191 
6192     /* Find the active elements.  */
6193     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6194         /* The entire predicate was false; no load occurs.  */
6195         memset(vd, 0, reg_max);
6196         return;
6197     }
6198     reg_off = info.reg_off_first[0];
6199 
6200     /* Probe the page(s). */
6201     if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6202         /* Fault on first element. */
6203         tcg_debug_assert(fault == FAULT_NO);
6204         memset(vd, 0, reg_max);
6205         goto do_fault;
6206     }
6207 
6208     mem_off = info.mem_off_first[0];
6209     flags = info.page[0].flags;
6210 
6211     /*
6212      * Disable MTE checking if the Tagged bit is not set.  Since TBI must
6213      * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6214      */
6215     if (!info.page[0].tagged) {
6216         mtedesc = 0;
6217     }
6218 
6219     if (fault == FAULT_FIRST) {
6220         /* Trapping mte check for the first-fault element.  */
6221         if (mtedesc) {
6222             mte_check(env, mtedesc, addr + mem_off, retaddr);
6223         }
6224 
6225         /*
6226          * Special handling of the first active element,
6227          * if it crosses a page boundary or is MMIO.
6228          */
6229         bool is_split = mem_off == info.mem_off_split;
6230         if (unlikely(flags != 0) || unlikely(is_split)) {
6231             /*
6232              * Use the slow path for cross-page handling.
6233              * Might trap for MMIO or watchpoints.
6234              */
6235             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6236 
6237             /* After any fault, zero the other elements. */
6238             swap_memzero(vd, reg_off);
6239             reg_off += 1 << esz;
6240             mem_off += 1 << msz;
6241             swap_memzero(vd + reg_off, reg_max - reg_off);
6242 
6243             if (is_split) {
6244                 goto second_page;
6245             }
6246         } else {
6247             memset(vd, 0, reg_max);
6248         }
6249     } else {
6250         memset(vd, 0, reg_max);
6251         if (unlikely(mem_off == info.mem_off_split)) {
6252             /* The first active element crosses a page boundary. */
6253             flags |= info.page[1].flags;
6254             if (unlikely(flags & TLB_MMIO)) {
6255                 /* Some page is MMIO, see below. */
6256                 goto do_fault;
6257             }
6258             if (unlikely(flags & TLB_WATCHPOINT) &&
6259                 (cpu_watchpoint_address_matches
6260                  (env_cpu(env), addr + mem_off, 1 << msz)
6261                  & BP_MEM_READ)) {
6262                 /* Watchpoint hit, see below. */
6263                 goto do_fault;
6264             }
6265             if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6266                 goto do_fault;
6267             }
6268             /*
6269              * Use the slow path for cross-page handling.
6270              * This is RAM, without a watchpoint, and will not trap.
6271              */
6272             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6273             goto second_page;
6274         }
6275     }
6276 
6277     /*
6278      * From this point on, all memory operations are MemSingleNF.
6279      *
6280      * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6281      * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6282      *
6283  * Unfortunately we do not have access to the memory attributes from the
6284      * PTE to tell Device memory from Normal memory.  So we make a mostly
6285      * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6286      * This gives the right answer for the common cases of "Normal memory,
6287      * backed by host RAM" and "Device memory, backed by MMIO".
6288      * The architecture allows us to suppress an NF load and return
6289      * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6290      * case of "Normal memory, backed by MMIO" is permitted.  The case we
6291      * get wrong is "Device memory, backed by host RAM", for which we
6292  * should return (UNKNOWN, FAULT) but do not.
6293      *
6294      * Similarly, CPU_BP breakpoints would raise exceptions, and so
6295      * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
6296      * architectural breakpoints the same.
6297      */
6298     if (unlikely(flags & TLB_MMIO)) {
6299         goto do_fault;
6300     }
6301 
6302     reg_last = info.reg_off_last[0];
6303     host = info.page[0].host;
6304 
6305     set_helper_retaddr(retaddr);
6306 
6307     do {
6308         uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6309         do {
6310             if ((pg >> (reg_off & 63)) & 1) {
6311                 if (unlikely(flags & TLB_WATCHPOINT) &&
6312                     (cpu_watchpoint_address_matches
6313                      (env_cpu(env), addr + mem_off, 1 << msz)
6314                      & BP_MEM_READ)) {
6315                     clear_helper_retaddr();
6316                     goto do_fault;
6317                 }
6318                 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6319                     clear_helper_retaddr();
6320                     goto do_fault;
6321                 }
6322                 host_fn(vd, reg_off, host + mem_off);
6323             }
6324             reg_off += 1 << esz;
6325             mem_off += 1 << msz;
6326         } while (reg_off <= reg_last && (reg_off & 63));
6327     } while (reg_off <= reg_last);
6328 
6329     clear_helper_retaddr();
6330 
6331     /*
6332      * MemSingleNF is allowed to fail for any reason.  We have special
6333      * code above to handle the first element crossing a page boundary.
6334      * As an implementation choice, decline to handle a cross-page element
6335      * in any other position.
6336      */
6337     reg_off = info.reg_off_split;
6338     if (reg_off >= 0) {
6339         goto do_fault;
6340     }
6341 
6342  second_page:
6343     reg_off = info.reg_off_first[1];
6344     if (likely(reg_off < 0)) {
6345         /* No active elements on the second page.  All done. */
6346         return;
6347     }
6348 
6349     /*
6350      * MemSingleNF is allowed to fail for any reason.  As an implementation
6351      * choice, decline to handle elements on the second page.  This should
6352      * be low frequency as the guest walks through memory -- the next
6353      * iteration of the guest's loop should be aligned on the page boundary,
6354      * and then all following iterations will stay aligned.
6355      */
6356 
6357  do_fault:
6358     record_fault(env, reg_off, reg_max);
6359 }
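
/*
 * A behavioural example of the fault modes handled above: for a
 * first-fault LDFF1B with 16 active elements where element 5 would
 * fault, elements 0..4 are loaded normally, record_fault() clears FFR
 * from bit 5 upward, and no exception is taken.  Only a fault on the
 * first active element of a first-fault load raises an exception; a
 * no-fault LDNF1* does not take memory faults at all and reports
 * everything via FFR.
 */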
6360 
6361 static inline QEMU_ALWAYS_INLINE
6362 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6363                        uint32_t desc, const uintptr_t retaddr,
6364                        const int esz, const int msz, const SVEContFault fault,
6365                        sve_ldst1_host_fn *host_fn,
6366                        sve_ldst1_tlb_fn *tlb_fn)
6367 {
6368     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6369     int bit55 = extract64(addr, 55, 1);
6370 
6371     /* Remove mtedesc from the normal sve descriptor. */
6372     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6373 
6374     /* Perform gross MTE suppression early. */
6375     if (!tbi_check(mtedesc, bit55) ||
6376         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6377         mtedesc = 0;
6378     }
6379 
6380     sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6381                   esz, msz, fault, host_fn, tlb_fn);
6382 }
6383 
6384 #define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
6385 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
6386                                  target_ulong addr, uint32_t desc)      \
6387 {                                                                       \
6388     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6389                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6390 }                                                                       \
6391 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
6392                                  target_ulong addr, uint32_t desc)      \
6393 {                                                                       \
6394     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6395                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6396 }                                                                       \
6397 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6398                                      target_ulong addr, uint32_t desc)  \
6399 {                                                                       \
6400     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6401                       sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
6402 }                                                                       \
6403 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6404                                      target_ulong addr, uint32_t desc)  \
6405 {                                                                       \
6406     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6407                       sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
6408 }
6409 
6410 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
6411 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
6412                                     target_ulong addr, uint32_t desc)   \
6413 {                                                                       \
6414     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6415                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6416 }                                                                       \
6417 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
6418                                     target_ulong addr, uint32_t desc)   \
6419 {                                                                       \
6420     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6421                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6422 }                                                                       \
6423 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
6424                                     target_ulong addr, uint32_t desc)   \
6425 {                                                                       \
6426     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6427                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6428 }                                                                       \
6429 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
6430                                     target_ulong addr, uint32_t desc)   \
6431 {                                                                       \
6432     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6433                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6434 }                                                                       \
6435 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6436                                         target_ulong addr, uint32_t desc) \
6437 {                                                                       \
6438     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6439                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6440 }                                                                       \
6441 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6442                                         target_ulong addr, uint32_t desc) \
6443 {                                                                       \
6444     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6445                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6446 }                                                                       \
6447 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6448                                         target_ulong addr, uint32_t desc) \
6449 {                                                                       \
6450     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6451                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6452 }                                                                       \
6453 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6454                                         target_ulong addr, uint32_t desc) \
6455 {                                                                       \
6456     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6457                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6458 }
6459 
6460 DO_LDFF1_LDNF1_1(bb,  MO_8)
6461 DO_LDFF1_LDNF1_1(bhu, MO_16)
6462 DO_LDFF1_LDNF1_1(bhs, MO_16)
6463 DO_LDFF1_LDNF1_1(bsu, MO_32)
6464 DO_LDFF1_LDNF1_1(bss, MO_32)
6465 DO_LDFF1_LDNF1_1(bdu, MO_64)
6466 DO_LDFF1_LDNF1_1(bds, MO_64)
6467 
6468 DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
6469 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6470 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6471 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6472 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6473 
6474 DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
6475 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6476 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6477 
6478 DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)
6479 
6480 #undef DO_LDFF1_LDNF1_1
6481 #undef DO_LDFF1_LDNF1_2
6482 
6483 /*
6484  * Common helper for all contiguous 1,2,3,4-register predicated stores.
6485  */
6486 
6487 static inline QEMU_ALWAYS_INLINE
6488 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6489                uint32_t desc, const uintptr_t retaddr,
6490                const int esz, const int msz, const int N, uint32_t mtedesc,
6491                sve_ldst1_host_fn *host_fn,
6492                sve_ldst1_tlb_fn *tlb_fn)
6493 {
6494     const unsigned rd = simd_data(desc);
6495     const intptr_t reg_max = simd_oprsz(desc);
6496     intptr_t reg_off, reg_last, mem_off;
6497     SVEContLdSt info;
6498     void *host;
6499     int i, flags;
6500 
6501     /* Find the active elements.  */
6502     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6503         /* The entire predicate was false; no store occurs.  */
6504         return;
6505     }
6506 
6507     /* Probe the page(s).  Exit with exception for any invalid page. */
6508     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6509 
6510     /* Handle watchpoints for all active elements. */
6511     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6512                               BP_MEM_WRITE, retaddr);
6513 
6514     /*
6515      * Handle mte checks for all active elements.
6516      * Since TBI must be set for MTE, !mtedesc => !mte_active.
6517      */
6518     if (mtedesc) {
6519         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6520                                 mtedesc, retaddr);
6521     }
6522 
6523     flags = info.page[0].flags | info.page[1].flags;
6524     if (unlikely(flags != 0)) {
6525         /*
6526          * At least one page includes MMIO.
6527          * Any bus operation can fail with cpu_transaction_failed,
6528          * which for ARM will raise SyncExternal.  We cannot avoid
6529          * this fault and will leave with the store incomplete.
6530          */
6531         mem_off = info.mem_off_first[0];
6532         reg_off = info.reg_off_first[0];
6533         reg_last = info.reg_off_last[1];
6534         if (reg_last < 0) {
6535             reg_last = info.reg_off_split;
6536             if (reg_last < 0) {
6537                 reg_last = info.reg_off_last[0];
6538             }
6539         }
6540 
6541         do {
6542             uint64_t pg = vg[reg_off >> 6];
6543             do {
6544                 if ((pg >> (reg_off & 63)) & 1) {
6545                     for (i = 0; i < N; ++i) {
6546                         tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6547                                addr + mem_off + (i << msz), retaddr);
6548                     }
6549                 }
6550                 reg_off += 1 << esz;
6551                 mem_off += N << msz;
6552             } while (reg_off & 63);
6553         } while (reg_off <= reg_last);
6554         return;
6555     }
6556 
6557     mem_off = info.mem_off_first[0];
6558     reg_off = info.reg_off_first[0];
6559     reg_last = info.reg_off_last[0];
6560     host = info.page[0].host;
6561 
6562     set_helper_retaddr(retaddr);
6563 
6564     while (reg_off <= reg_last) {
6565         uint64_t pg = vg[reg_off >> 6];
6566         do {
6567             if ((pg >> (reg_off & 63)) & 1) {
6568                 for (i = 0; i < N; ++i) {
6569                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6570                             host + mem_off + (i << msz));
6571                 }
6572             }
6573             reg_off += 1 << esz;
6574             mem_off += N << msz;
6575         } while (reg_off <= reg_last && (reg_off & 63));
6576     }
6577 
6578     clear_helper_retaddr();
6579 
6580     /*
6581      * Use the slow path to manage the cross-page misalignment.
6582      * But we know this is RAM and cannot trap.
6583      */
6584     mem_off = info.mem_off_split;
6585     if (unlikely(mem_off >= 0)) {
6586         reg_off = info.reg_off_split;
6587         for (i = 0; i < N; ++i) {
6588             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6589                    addr + mem_off + (i << msz), retaddr);
6590         }
6591     }
6592 
6593     mem_off = info.mem_off_first[1];
6594     if (unlikely(mem_off >= 0)) {
6595         reg_off = info.reg_off_first[1];
6596         reg_last = info.reg_off_last[1];
6597         host = info.page[1].host;
6598 
6599         set_helper_retaddr(retaddr);
6600 
6601         do {
6602             uint64_t pg = vg[reg_off >> 6];
6603             do {
6604                 if ((pg >> (reg_off & 63)) & 1) {
6605                     for (i = 0; i < N; ++i) {
6606                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6607                                 host + mem_off + (i << msz));
6608                     }
6609                 }
6610                 reg_off += 1 << esz;
6611                 mem_off += N << msz;
6612             } while (reg_off & 63);
6613         } while (reg_off <= reg_last);
6614 
6615         clear_helper_retaddr();
6616     }
6617 }
6618 
6619 static inline QEMU_ALWAYS_INLINE
6620 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6621                    uint32_t desc, const uintptr_t ra,
6622                    const int esz, const int msz, const int N,
6623                    sve_ldst1_host_fn *host_fn,
6624                    sve_ldst1_tlb_fn *tlb_fn)
6625 {
6626     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6627     int bit55 = extract64(addr, 55, 1);
6628 
6629     /* Remove mtedesc from the normal sve descriptor. */
6630     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6631 
6632     /* Perform gross MTE suppression early. */
6633     if (!tbi_check(mtedesc, bit55) ||
6634         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6635         mtedesc = 0;
6636     }
6637 
6638     sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6639 }
6640 
6641 #define DO_STN_1(N, NAME, ESZ)                                          \
6642 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
6643                                  target_ulong addr, uint32_t desc)      \
6644 {                                                                       \
6645     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
6646               sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
6647 }                                                                       \
6648 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
6649                                      target_ulong addr, uint32_t desc)  \
6650 {                                                                       \
6651     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
6652                   sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
6653 }
6654 
6655 #define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
6656 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
6657                                     target_ulong addr, uint32_t desc)   \
6658 {                                                                       \
6659     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6660               sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
6661 }                                                                       \
6662 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
6663                                     target_ulong addr, uint32_t desc)   \
6664 {                                                                       \
6665     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6666               sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
6667 }                                                                       \
6668 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
6669                                         target_ulong addr, uint32_t desc) \
6670 {                                                                       \
6671     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6672                   sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
6673 }                                                                       \
6674 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
6675                                         target_ulong addr, uint32_t desc) \
6676 {                                                                       \
6677     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6678                   sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
6679 }
6680 
6681 DO_STN_1(1, bb, MO_8)
6682 DO_STN_1(1, bh, MO_16)
6683 DO_STN_1(1, bs, MO_32)
6684 DO_STN_1(1, bd, MO_64)
6685 DO_STN_1(2, bb, MO_8)
6686 DO_STN_1(3, bb, MO_8)
6687 DO_STN_1(4, bb, MO_8)
6688 
6689 DO_STN_2(1, hh, MO_16, MO_16)
6690 DO_STN_2(1, hs, MO_32, MO_16)
6691 DO_STN_2(1, hd, MO_64, MO_16)
6692 DO_STN_2(2, hh, MO_16, MO_16)
6693 DO_STN_2(3, hh, MO_16, MO_16)
6694 DO_STN_2(4, hh, MO_16, MO_16)
6695 
6696 DO_STN_2(1, ss, MO_32, MO_32)
6697 DO_STN_2(1, sd, MO_64, MO_32)
6698 DO_STN_2(2, ss, MO_32, MO_32)
6699 DO_STN_2(3, ss, MO_32, MO_32)
6700 DO_STN_2(4, ss, MO_32, MO_32)
6701 
6702 DO_STN_2(1, dd, MO_64, MO_64)
6703 DO_STN_2(2, dd, MO_64, MO_64)
6704 DO_STN_2(3, dd, MO_64, MO_64)
6705 DO_STN_2(4, dd, MO_64, MO_64)
6706 
6707 #undef DO_STN_1
6708 #undef DO_STN_2
6709 
6710 /*
6711  * Loads with a vector index.
6712  */
6713 
6714 /*
6715  * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6716  */
6717 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6718 
6719 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6720 {
6721     return *(uint32_t *)(reg + H1_4(reg_ofs));
6722 }
6723 
6724 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6725 {
6726     return *(int32_t *)(reg + H1_4(reg_ofs));
6727 }
6728 
6729 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6730 {
6731     return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6732 }
6733 
6734 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6735 {
6736     return (int32_t)*(uint64_t *)(reg + reg_ofs);
6737 }
6738 
6739 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6740 {
6741     return *(uint64_t *)(reg + reg_ofs);
6742 }
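
/*
 * A rough key to the off_* names above: "zsu" and "zss" take 32-bit
 * offsets from the index vector (zero- and sign-extended respectively),
 * while "zd" takes full 64-bit offsets; the _s/_d suffix is the element
 * size of the index vector itself.  For example:
 *
 *   off_zss_s: 32-bit elements, each a signed 32-bit offset
 *   off_zsu_d: 64-bit elements, low 32 bits used as an unsigned offset
 *   off_zd_d:  64-bit elements, used as unsigned 64-bit offsets
 */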
6743 
6744 static inline QEMU_ALWAYS_INLINE
6745 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6746                target_ulong base, uint32_t desc, uintptr_t retaddr,
6747                uint32_t mtedesc, int esize, int msize,
6748                zreg_off_fn *off_fn,
6749                sve_ldst1_host_fn *host_fn,
6750                sve_ldst1_tlb_fn *tlb_fn)
6751 {
6752     const int mmu_idx = arm_env_mmu_index(env);
6753     const intptr_t reg_max = simd_oprsz(desc);
6754     const int scale = simd_data(desc);
6755     ARMVectorReg scratch;
6756     intptr_t reg_off;
6757     SVEHostPage info, info2;
6758 
6759     memset(&scratch, 0, reg_max);
6760     reg_off = 0;
6761     do {
6762         uint64_t pg = vg[reg_off >> 6];
6763         do {
6764             if (likely(pg & 1)) {
6765                 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6766                 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6767 
6768                 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6769                                mmu_idx, retaddr);
6770 
6771                 if (likely(in_page >= msize)) {
6772                     if (unlikely(info.flags & TLB_WATCHPOINT)) {
6773                         cpu_check_watchpoint(env_cpu(env), addr, msize,
6774                                              info.attrs, BP_MEM_READ, retaddr);
6775                     }
6776                     if (mtedesc && info.tagged) {
6777                         mte_check(env, mtedesc, addr, retaddr);
6778                     }
6779                     if (unlikely(info.flags & TLB_MMIO)) {
6780                         tlb_fn(env, &scratch, reg_off, addr, retaddr);
6781                     } else {
6782                         set_helper_retaddr(retaddr);
6783                         host_fn(&scratch, reg_off, info.host);
6784                         clear_helper_retaddr();
6785                     }
6786                 } else {
6787                     /* Element crosses the page boundary. */
6788                     sve_probe_page(&info2, false, env, addr + in_page, 0,
6789                                    MMU_DATA_LOAD, mmu_idx, retaddr);
6790                     if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6791                         cpu_check_watchpoint(env_cpu(env), addr,
6792                                              msize, info.attrs,
6793                                              BP_MEM_READ, retaddr);
6794                     }
6795                     if (mtedesc && info.tagged) {
6796                         mte_check(env, mtedesc, addr, retaddr);
6797                     }
6798                     tlb_fn(env, &scratch, reg_off, addr, retaddr);
6799                 }
6800             }
6801             reg_off += esize;
6802             pg >>= esize;
6803         } while (reg_off & 63);
6804     } while (reg_off < reg_max);
6805 
6806     /* Wait until all exceptions have been raised to write back.  */
6807     memcpy(vd, &scratch, reg_max);
6808 }
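
/*
 * In outline (an illustrative sketch only), the gather load above does,
 * for each active element:
 *
 *   addr = base + (off_fn(vm, reg_off) << scale);
 *   if the msize-byte access fits within one page:
 *       check watchpoints and MTE, then use tlb_fn() for MMIO or
 *       host_fn() directly on info.host;
 *   else:
 *       probe both pages, check watchpoints and MTE, and use tlb_fn(),
 *       which copes with the page split;
 *
 * All data is loaded into a zeroed scratch register and only copied to
 * vd once no further faults can be raised, so a trapping element leaves
 * vd unmodified.
 */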
6809 
6810 static inline QEMU_ALWAYS_INLINE
6811 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6812                    target_ulong base, uint32_t desc, uintptr_t retaddr,
6813                    int esize, int msize, zreg_off_fn *off_fn,
6814                    sve_ldst1_host_fn *host_fn,
6815                    sve_ldst1_tlb_fn *tlb_fn)
6816 {
6817     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6818     /* Remove mtedesc from the normal sve descriptor. */
6819     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6820 
6821     /*
6822      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6823      * offset base entirely over the address space hole to change the
6824  * pointer tag, or change the bit55 selector.  So we could examine
6825  * TBI + TCMA here, as we do for sve_ldN_r_mte().
6826      */
6827     sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6828               esize, msize, off_fn, host_fn, tlb_fn);
6829 }
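
/*
 * Note that the non-MTE entry points below pass mtedesc == 0 -- e.g.
 * HELPER(sve_ldbsu_zsu) calls sve_ld1_z(..., GETPC(), 0, 4, 1, ...) --
 * so the "mtedesc && info.tagged" tests in sve_ld1_z() perform no MTE
 * checking at all; only the _mte helpers carry a real descriptor in the
 * bits above SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT.
 */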
6830 
6831 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6832 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6833                                  void *vm, target_ulong base, uint32_t desc) \
6834 {                                                                            \
6835     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
6836               off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6837 }                                                                            \
6838 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6839      void *vm, target_ulong base, uint32_t desc)                             \
6840 {                                                                            \
6841     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
6842                   off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6843 }
6844 
6845 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6846 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6847                                  void *vm, target_ulong base, uint32_t desc) \
6848 {                                                                            \
6849     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
6850               off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6851 }                                                                            \
6852 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6853     void *vm, target_ulong base, uint32_t desc)                              \
6854 {                                                                            \
6855     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
6856                   off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6857 }
6858 
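/*
 * A rough decoding of the MEM names used below: the first letter is the
 * memory access size (b/h/s/d), the second the destination element size
 * (s = 32-bit, d = 64-bit), a trailing u/s selects zero- vs
 * sign-extension when the sizes differ, and _le/_be the memory
 * endianness.  So "hdu_le" is a little-endian 16-bit load zero-extended
 * into 64-bit elements, and "bss" is a byte load sign-extended into
 * 32-bit elements.
 */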
6859 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6860 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6861 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6862 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6863 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6864 
6865 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6866 DO_LD1_ZPZ_S(bss, zss, MO_8)
6867 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6868 DO_LD1_ZPZ_D(bds, zss, MO_8)
6869 DO_LD1_ZPZ_D(bds, zd, MO_8)
6870 
6871 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6872 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6873 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6874 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6875 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6876 
6877 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6878 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6879 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6880 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6881 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6882 
6883 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6884 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6885 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6886 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6887 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6888 
6889 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6890 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6891 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6892 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6893 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6894 
6895 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6896 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6897 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6898 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6899 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6900 
6901 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6902 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6903 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6904 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6905 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6906 
6907 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6908 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6909 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6910 
6911 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6912 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6913 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6914 
6915 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6916 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6917 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6918 
6919 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6920 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6921 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6922 
6923 #undef DO_LD1_ZPZ_S
6924 #undef DO_LD1_ZPZ_D
6925 
6926 /* First fault loads with a vector index.  */
6927 
6928 /*
6929  * Common helpers for all gather first-faulting loads.
6930  */
6931 
6932 static inline QEMU_ALWAYS_INLINE
6933 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6934                  target_ulong base, uint32_t desc, uintptr_t retaddr,
6935                  uint32_t mtedesc, const int esz, const int msz,
6936                  zreg_off_fn *off_fn,
6937                  sve_ldst1_host_fn *host_fn,
6938                  sve_ldst1_tlb_fn *tlb_fn)
6939 {
6940     const int mmu_idx = arm_env_mmu_index(env);
6941     const intptr_t reg_max = simd_oprsz(desc);
6942     const int scale = simd_data(desc);
6943     const int esize = 1 << esz;
6944     const int msize = 1 << msz;
6945     intptr_t reg_off;
6946     SVEHostPage info;
6947     target_ulong addr, in_page;
6948     ARMVectorReg scratch;
6949 
6950     /* Skip to the first true predicate.  */
6951     reg_off = find_next_active(vg, 0, reg_max, esz);
6952     if (unlikely(reg_off >= reg_max)) {
6953         /* The entire predicate was false; no load occurs.  */
6954         memset(vd, 0, reg_max);
6955         return;
6956     }
6957 
6958     /* Protect against overlap between vd and vm. */
6959     if (unlikely(vd == vm)) {
6960         vm = memcpy(&scratch, vm, reg_max);
6961     }
6962 
6963     /*
6964      * Probe the first element, allowing faults.
6965      */
6966     addr = base + (off_fn(vm, reg_off) << scale);
6967     if (mtedesc) {
6968         mte_check(env, mtedesc, addr, retaddr);
6969     }
6970     tlb_fn(env, vd, reg_off, addr, retaddr);
6971 
6972     /* After any fault, zero the other elements. */
6973     swap_memzero(vd, reg_off);
6974     reg_off += esize;
6975     swap_memzero(vd + reg_off, reg_max - reg_off);
6976 
6977     /*
6978      * Probe the remaining elements, not allowing faults.
6979      */
6980     while (reg_off < reg_max) {
6981         uint64_t pg = vg[reg_off >> 6];
6982         do {
6983             if (likely((pg >> (reg_off & 63)) & 1)) {
6984                 addr = base + (off_fn(vm, reg_off) << scale);
6985                 in_page = -(addr | TARGET_PAGE_MASK);
6986 
6987                 if (unlikely(in_page < msize)) {
6988                     /* Stop if the element crosses a page boundary. */
6989                     goto fault;
6990                 }
6991 
6992                 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6993                                mmu_idx, retaddr);
6994                 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6995                     goto fault;
6996                 }
6997                 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6998                     (cpu_watchpoint_address_matches
6999                      (env_cpu(env), addr, msize) & BP_MEM_READ)) {
7000                     goto fault;
7001                 }
7002                 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
7003                     goto fault;
7004                 }
7005 
7006                 set_helper_retaddr(retaddr);
7007                 host_fn(vd, reg_off, info.host);
7008                 clear_helper_retaddr();
7009             }
7010             reg_off += esize;
7011         } while (reg_off & 63);
7012     }
7013     return;
7014 
7015  fault:
7016     record_fault(env, reg_off, reg_max);
7017 }
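
/*
 * The first-fault contract, roughly: the first active element is loaded
 * with tlb_fn() and may fault in the normal way.  Every later element is
 * only attempted via no-fault probes; if it would cross a page, touch an
 * invalid or MMIO page, hit a watchpoint, or fail an MTE probe, we jump
 * to record_fault() instead, which (as with the contiguous first-fault
 * loads earlier in this file) marks the faulting element and everything
 * after it as not loaded in the FFR rather than raising an exception.
 */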
7018 
7019 static inline QEMU_ALWAYS_INLINE
7020 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7021                      target_ulong base, uint32_t desc, uintptr_t retaddr,
7022                      const int esz, const int msz,
7023                      zreg_off_fn *off_fn,
7024                      sve_ldst1_host_fn *host_fn,
7025                      sve_ldst1_tlb_fn *tlb_fn)
7026 {
7027     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7028     /* Remove mtedesc from the normal sve descriptor. */
7029     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7030 
7031     /*
7032      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7033      * offset base entirely over the address space hole to change the
7034  * pointer tag, or change the bit55 selector.  So we could examine
7035  * TBI + TCMA here, as we do for sve_ldN_r_mte().
7036      */
7037     sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7038                 esz, msz, off_fn, host_fn, tlb_fn);
7039 }
7040 
7041 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
7042 void HELPER(sve_ldff##MEM##_##OFS)                                      \
7043     (CPUARMState *env, void *vd, void *vg,                              \
7044      void *vm, target_ulong base, uint32_t desc)                        \
7045 {                                                                       \
7046     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
7047                 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7048 }                                                                       \
7049 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
7050     (CPUARMState *env, void *vd, void *vg,                              \
7051      void *vm, target_ulong base, uint32_t desc)                        \
7052 {                                                                       \
7053     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
7054                     off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7055 }
7056 
7057 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
7058 void HELPER(sve_ldff##MEM##_##OFS)                                      \
7059     (CPUARMState *env, void *vd, void *vg,                              \
7060      void *vm, target_ulong base, uint32_t desc)                        \
7061 {                                                                       \
7062     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
7063                 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7064 }                                                                       \
7065 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
7066     (CPUARMState *env, void *vd, void *vg,                              \
7067      void *vm, target_ulong base, uint32_t desc)                        \
7068 {                                                                       \
7069     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
7070                     off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7071 }
7072 
7073 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
7074 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
7075 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
7076 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
7077 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
7078 
7079 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
7080 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
7081 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
7082 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
7083 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
7084 
7085 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
7086 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
7087 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
7088 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
7089 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
7090 
7091 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
7092 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
7093 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
7094 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
7095 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
7096 
7097 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
7098 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
7099 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
7100 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
7101 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
7102 
7103 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
7104 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
7105 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
7106 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
7107 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
7108 
7109 DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
7110 DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
7111 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
7112 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
7113 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
7114 
7115 DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
7116 DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
7117 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
7118 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
7119 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
7120 
7121 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
7122 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
7123 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
7124 
7125 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
7126 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
7127 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
7128 
7129 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
7130 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
7131 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
7132 
7133 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
7134 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
7135 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
7136 
7137 /* Stores with a vector index.  */
7138 
7139 static inline QEMU_ALWAYS_INLINE
7140 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7141                target_ulong base, uint32_t desc, uintptr_t retaddr,
7142                uint32_t mtedesc, int esize, int msize,
7143                zreg_off_fn *off_fn,
7144                sve_ldst1_host_fn *host_fn,
7145                sve_ldst1_tlb_fn *tlb_fn)
7146 {
7147     const int mmu_idx = arm_env_mmu_index(env);
7148     const intptr_t reg_max = simd_oprsz(desc);
7149     const int scale = simd_data(desc);
7150     void *host[ARM_MAX_VQ * 4];
7151     intptr_t reg_off, i;
7152     SVEHostPage info, info2;
7153 
7154     /*
7155      * Probe all of the elements for host addresses and flags.
7156      */
7157     i = reg_off = 0;
7158     do {
7159         uint64_t pg = vg[reg_off >> 6];
7160         do {
7161             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7162             target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7163 
7164             host[i] = NULL;
7165             if (likely((pg >> (reg_off & 63)) & 1)) {
7166                 if (likely(in_page >= msize)) {
7167                     sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
7168                                    mmu_idx, retaddr);
7169                     if (!(info.flags & TLB_MMIO)) {
7170                         host[i] = info.host;
7171                     }
7172                 } else {
7173                     /*
7174                      * Element crosses the page boundary.
7175                      * Probe both pages, but do not record the host address,
7176                      * so that we use the slow path.
7177                      */
7178                     sve_probe_page(&info, false, env, addr, 0,
7179                                    MMU_DATA_STORE, mmu_idx, retaddr);
7180                     sve_probe_page(&info2, false, env, addr + in_page, 0,
7181                                    MMU_DATA_STORE, mmu_idx, retaddr);
7182                     info.flags |= info2.flags;
7183                 }
7184 
7185                 if (unlikely(info.flags & TLB_WATCHPOINT)) {
7186                     cpu_check_watchpoint(env_cpu(env), addr, msize,
7187                                          info.attrs, BP_MEM_WRITE, retaddr);
7188                 }
7189 
7190                 if (mtedesc && info.tagged) {
7191                     mte_check(env, mtedesc, addr, retaddr);
7192                 }
7193             }
7194             i += 1;
7195             reg_off += esize;
7196         } while (reg_off & 63);
7197     } while (reg_off < reg_max);
7198 
7199     /*
7200      * Now that we have recognized all exceptions except SyncExternal
7201      * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7202      *
7203      * Note for the common case of an element in RAM, not crossing a page
7204      * boundary, we have stored the host address in host[].  This doubles
7205      * as a first-level check against the predicate, since only enabled
7206      * elements have non-null host addresses.
7207      */
7208     i = reg_off = 0;
7209     do {
7210         void *h = host[i];
7211         if (likely(h != NULL)) {
7212             set_helper_retaddr(retaddr);
7213             host_fn(vd, reg_off, h);
7214             clear_helper_retaddr();
7215         } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7216             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7217             tlb_fn(env, vd, reg_off, addr, retaddr);
7218         }
7219         i += 1;
7220         reg_off += esize;
7221     } while (reg_off < reg_max);
7222 }
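
/*
 * Scatter stores are done in two passes so that, as with the loads, no
 * data is written unless every active element has already been
 * validated.  Pass one probes each active element, raising any access,
 * watchpoint or MTE fault now, and caches the host address when the
 * element lies in ordinary RAM within a single page.  Pass two performs
 * the stores: a non-null host[] entry doubles as the "element is active"
 * test and the fast path via host_fn(); MMIO and page-crossing elements
 * fall back to tlb_fn().
 */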
7223 
7224 static inline QEMU_ALWAYS_INLINE
7225 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7226                    target_ulong base, uint32_t desc, uintptr_t retaddr,
7227                    int esize, int msize, zreg_off_fn *off_fn,
7228                    sve_ldst1_host_fn *host_fn,
7229                    sve_ldst1_tlb_fn *tlb_fn)
7230 {
7231     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7232     /* Remove mtedesc from the normal sve descriptor. */
7233     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7234 
7235     /*
7236      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7237      * offset base entirely over the address space hole to change the
7238  * pointer tag, or change the bit55 selector.  So we could examine
7239  * TBI + TCMA here, as we do for sve_ldN_r_mte().
7240      */
7241     sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7242               esize, msize, off_fn, host_fn, tlb_fn);
7243 }
7244 
7245 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
7246 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7247                                  void *vm, target_ulong base, uint32_t desc) \
7248 {                                                                       \
7249     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
7250               off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7251 }                                                                       \
7252 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7253     void *vm, target_ulong base, uint32_t desc)                         \
7254 {                                                                       \
7255     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
7256                   off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7257 }
7258 
7259 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
7260 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7261                                  void *vm, target_ulong base, uint32_t desc) \
7262 {                                                                       \
7263     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
7264               off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7265 }                                                                       \
7266 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7267     void *vm, target_ulong base, uint32_t desc)                         \
7268 {                                                                       \
7269     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
7270                   off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7271 }
7272 
7273 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7274 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7275 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7276 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7277 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7278 
7279 DO_ST1_ZPZ_S(bs, zss, MO_8)
7280 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7281 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7282 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7283 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7284 
7285 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7286 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7287 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7288 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7289 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7290 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7291 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7292 
7293 DO_ST1_ZPZ_D(bd, zss, MO_8)
7294 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7295 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7296 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7297 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7298 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7299 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7300 
7301 DO_ST1_ZPZ_D(bd, zd, MO_8)
7302 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7303 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7304 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7305 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7306 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7307 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7308 
7309 #undef DO_ST1_ZPZ_S
7310 #undef DO_ST1_ZPZ_D
7311 
7312 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7313 {
7314     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7315     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7316 
7317     for (i = 0; i < opr_sz; ++i) {
7318         d[i] = n[i] ^ m[i] ^ k[i];
7319     }
7320 }
7321 
7322 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7323 {
7324     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7325     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7326 
7327     for (i = 0; i < opr_sz; ++i) {
7328         d[i] = n[i] ^ (m[i] & ~k[i]);
7329     }
7330 }
7331 
7332 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7333 {
7334     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7335     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7336 
7337     for (i = 0; i < opr_sz; ++i) {
7338         d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7339     }
7340 }
7341 
7342 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7343 {
7344     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7345     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7346 
7347     for (i = 0; i < opr_sz; ++i) {
7348         d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7349     }
7350 }
7351 
7352 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7353 {
7354     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7355     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7356 
7357     for (i = 0; i < opr_sz; ++i) {
7358         d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7359     }
7360 }
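
/*
 * For reference, the per-bit functions computed by the five SVE2
 * bitwise ternary helpers above are:
 *
 *   EOR3:  d = n ^ m ^ k
 *   BCAX:  d = n ^ (m & ~k)
 *   BSL1N: d = (~n & k) | (m & ~k)
 *   BSL2N: d = (n & k) | (~m & ~k)
 *   NBSL:  d = ~((n & k) | (m & ~k))
 *
 * i.e. variants of "bit select by k" with one operand or the result
 * inverted.
 */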
7361 
7362 /*
7363  * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7364  * See hasless(v,1) from
7365  *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7366  */
7367 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7368 {
7369     int bits = 8 << esz;
7370     uint64_t ones = dup_const(esz, 1);
7371     uint64_t signs = ones << (bits - 1);
7372     uint64_t cmp0, cmp1;
7373 
7374     cmp1 = dup_const(esz, n);
7375     cmp0 = cmp1 ^ m0;
7376     cmp1 = cmp1 ^ m1;
7377     cmp0 = (cmp0 - ones) & ~cmp0;
7378     cmp1 = (cmp1 - ones) & ~cmp1;
7379     return (cmp0 | cmp1) & signs;
7380 }
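
/*
 * A sketch of why this works, taking esz == MO_8 and n == 0x42:
 * cmp0 = m0 ^ 0x4242...42 has a zero byte exactly where a byte of m0
 * equals 0x42.  Subtracting "ones" makes such a byte borrow, so its
 * sign bit is set in (cmp0 - ones), and "& ~cmp0" keeps that sign bit
 * only for bytes whose own sign bit was clear.  Borrow chains can set
 * spurious sign bits in higher bytes, but only when a genuinely zero
 * byte sits below them, so the final "(cmp0 | cmp1) & signs" test for
 * "is there any match at all" remains exact.
 */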
7381 
7382 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7383                                 uint32_t desc, int esz, bool nmatch)
7384 {
7385     uint16_t esz_mask = pred_esz_masks[esz];
7386     intptr_t opr_sz = simd_oprsz(desc);
7387     uint32_t flags = PREDTEST_INIT;
7388     intptr_t i, j, k;
7389 
7390     for (i = 0; i < opr_sz; i += 16) {
7391         uint64_t m0 = *(uint64_t *)(vm + i);
7392         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7393         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7394         uint16_t out = 0;
7395 
7396         for (j = 0; j < 16; j += 8) {
7397             uint64_t n = *(uint64_t *)(vn + i + j);
7398 
7399             for (k = 0; k < 8; k += 1 << esz) {
7400                 if (pg & (1 << (j + k))) {
7401                     bool o = do_match2(n >> (k * 8), m0, m1, esz);
7402                     out |= (o ^ nmatch) << (j + k);
7403                 }
7404             }
7405         }
7406         *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7407         flags = iter_predtest_fwd(out, pg, flags);
7408     }
7409     return flags;
7410 }
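
/*
 * In other words: within each 16-byte segment, an active element of vn
 * gets its result predicate bit set when its value does (MATCH) or does
 * not (NMATCH) occur among the 16 bytes or 8 halfwords of the matching
 * segment of vm; inactive elements produce 0.  The accumulated NZCV
 * flags come from iter_predtest_fwd() over the resulting predicate, as
 * for the other flag-setting predicate operations.
 */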
7411 
7412 #define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
7413 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
7414 {                                                                             \
7415     return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
7416 }
7417 
7418 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7419 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7420 
7421 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7422 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7423 
7424 #undef DO_PPZZ_MATCH
7425 
7426 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7427                             uint32_t desc)
7428 {
7429     ARMVectorReg scratch;
7430     intptr_t i, j;
7431     intptr_t opr_sz = simd_oprsz(desc);
7432     uint32_t *d = vd, *n = vn, *m = vm;
7433     uint8_t *pg = vg;
7434 
7435     if (d == n) {
7436         n = memcpy(&scratch, n, opr_sz);
7437         if (d == m) {
7438             m = n;
7439         }
7440     } else if (d == m) {
7441         m = memcpy(&scratch, m, opr_sz);
7442     }
7443 
7444     for (i = 0; i < opr_sz; i += 4) {
7445         uint64_t count = 0;
7446         uint8_t pred;
7447 
7448         pred = pg[H1(i >> 3)] >> (i & 7);
7449         if (pred & 1) {
7450             uint32_t nn = n[H4(i >> 2)];
7451 
7452             for (j = 0; j <= i; j += 4) {
7453                 pred = pg[H1(j >> 3)] >> (j & 7);
7454                 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7455                     ++count;
7456                 }
7457             }
7458         }
7459         d[H4(i >> 2)] = count;
7460     }
7461 }
7462 
7463 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7464                             uint32_t desc)
7465 {
7466     ARMVectorReg scratch;
7467     intptr_t i, j;
7468     intptr_t opr_sz = simd_oprsz(desc);
7469     uint64_t *d = vd, *n = vn, *m = vm;
7470     uint8_t *pg = vg;
7471 
7472     if (d == n) {
7473         n = memcpy(&scratch, n, opr_sz);
7474         if (d == m) {
7475             m = n;
7476         }
7477     } else if (d == m) {
7478         m = memcpy(&scratch, m, opr_sz);
7479     }
7480 
7481     for (i = 0; i < opr_sz / 8; ++i) {
7482         uint64_t count = 0;
7483         if (pg[H1(i)] & 1) {
7484             uint64_t nn = n[i];
7485             for (j = 0; j <= i; ++j) {
7486                 if ((pg[H1(j)] & 1) && nn == m[j]) {
7487                     ++count;
7488                 }
7489             }
7490         }
7491         d[i] = count;
7492     }
7493 }
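
/*
 * A small worked example of HISTCNT with an all-true predicate: with
 * n = {1, 2, 1, 1} and m = {1, 1, 2, 1} (element 0 first), each element
 * i counts the active elements j <= i with m[j] == n[i]:
 *
 *   d[0] = 1   (m[0] == 1)
 *   d[1] = 0   (no 2 among m[0..1])
 *   d[2] = 2   (m[0] and m[1] == 1)
 *   d[3] = 3   (m[0], m[1] and m[3] == 1)
 *
 * Inactive destination elements are written as 0.
 */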
7494 
7495 /*
7496  * Returns the number of bytes in m0 and m1 that match n.
7497  * Unlike do_match2 we don't just need true/false, we need an exact count.
7498  * This requires two extra logical operations.
7499  */
7500 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7501 {
7502     const uint64_t mask = dup_const(MO_8, 0x7f);
7503     uint64_t cmp0, cmp1;
7504 
7505     cmp1 = dup_const(MO_8, n);
7506     cmp0 = cmp1 ^ m0;
7507     cmp1 = cmp1 ^ m1;
7508 
7509     /*
7510      * 1: clear msb of each byte to avoid carry to next byte (& mask)
7511      * 2: carry in to msb if byte != 0 (+ mask)
7512      * 3: set msb if cmp has msb set (| cmp)
7513      * 4: set ~msb to ignore them (| mask)
7514      * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7515      * 5: invert, resulting in 0x80 if and only if byte == 0.
7516      */
7517     cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7518     cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7519 
7520     /*
7521      * Combine the two compares in a way that the bits do
7522      * not overlap, and so preserves the count of set bits.
7523      * If the host has an efficient instruction for ctpop,
7524      * then ctpop(x) + ctpop(y) has the same number of
7525      * operations as ctpop(x | (y >> 1)).  If the host does
7526      * not have an efficient ctpop, then we only want to
7527      * use it once.
7528      */
7529     return ctpop64(cmp0 | (cmp1 >> 1));
7530 }
7531 
7532 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7533 {
7534     intptr_t i, j;
7535     intptr_t opr_sz = simd_oprsz(desc);
7536 
7537     for (i = 0; i < opr_sz; i += 16) {
7538         uint64_t n0 = *(uint64_t *)(vn + i);
7539         uint64_t m0 = *(uint64_t *)(vm + i);
7540         uint64_t n1 = *(uint64_t *)(vn + i + 8);
7541         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7542         uint64_t out0 = 0;
7543         uint64_t out1 = 0;
7544 
7545         for (j = 0; j < 64; j += 8) {
7546             uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7547             uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7548             out0 |= cnt0 << j;
7549             out1 |= cnt1 << j;
7550         }
7551 
7552         *(uint64_t *)(vd + i) = out0;
7553         *(uint64_t *)(vd + i + 8) = out1;
7554     }
7555 }
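
/*
 * HISTSEG works within independent 16-byte segments: each byte of the
 * destination receives the number of bytes in the same segment of vm
 * that equal the corresponding byte of vn, a count in the range 0..16
 * which always fits within the byte.
 */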
7556 
7557 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7558 {
7559     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7560     int shr = simd_data(desc);
7561     int shl = 8 - shr;
7562     uint64_t mask = dup_const(MO_8, 0xff >> shr);
7563     uint64_t *d = vd, *n = vn, *m = vm;
7564 
7565     for (i = 0; i < opr_sz; ++i) {
7566         uint64_t t = n[i] ^ m[i];
7567         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7568     }
7569 }
7570 
7571 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7572 {
7573     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7574     int shr = simd_data(desc);
7575     int shl = 16 - shr;
7576     uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7577     uint64_t *d = vd, *n = vn, *m = vm;
7578 
7579     for (i = 0; i < opr_sz; ++i) {
7580         uint64_t t = n[i] ^ m[i];
7581         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7582     }
7583 }
7584 
7585 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7586 {
7587     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7588     int shr = simd_data(desc);
7589     uint32_t *d = vd, *n = vn, *m = vm;
7590 
7591     for (i = 0; i < opr_sz; ++i) {
7592         d[i] = ror32(n[i] ^ m[i], shr);
7593     }
7594 }
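
/*
 * XAR is "EOR, then rotate each element right by shr".  For the byte
 * and halfword forms there is no per-lane rotate on a packed uint64_t,
 * so the rotate is built from two shifts plus a lane mask; the word
 * form can use ror32() on each element directly (there is no _d helper
 * here -- that case is expanded elsewhere).  For example, with the byte
 * form and shr == 3, a lane value of 0xb1 becomes
 * ((0xb1 >> 3) | (0xb1 << 5)) & 0xff == 0x36.
 */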
7595 
7596 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7597                      float_status *status, uint32_t desc)
7598 {
7599     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7600 
7601     for (s = 0; s < opr_sz; ++s) {
7602         float32 *n = vn + s * sizeof(float32) * 4;
7603         float32 *m = vm + s * sizeof(float32) * 4;
7604         float32 *a = va + s * sizeof(float32) * 4;
7605         float32 *d = vd + s * sizeof(float32) * 4;
7606         float32 n00 = n[H4(0)], n01 = n[H4(1)];
7607         float32 n10 = n[H4(2)], n11 = n[H4(3)];
7608         float32 m00 = m[H4(0)], m01 = m[H4(1)];
7609         float32 m10 = m[H4(2)], m11 = m[H4(3)];
7610         float32 p0, p1;
7611 
7612         /* i = 0, j = 0 */
7613         p0 = float32_mul(n00, m00, status);
7614         p1 = float32_mul(n01, m01, status);
7615         d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7616 
7617         /* i = 0, j = 1 */
7618         p0 = float32_mul(n00, m10, status);
7619         p1 = float32_mul(n01, m11, status);
7620         d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7621 
7622         /* i = 1, j = 0 */
7623         p0 = float32_mul(n10, m00, status);
7624         p1 = float32_mul(n11, m01, status);
7625         d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7626 
7627         /* i = 1, j = 1 */
7628         p0 = float32_mul(n10, m10, status);
7629         p1 = float32_mul(n11, m11, status);
7630         d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7631     }
7632 }
7633 
7634 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7635                      float_status *status, uint32_t desc)
7636 {
7637     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7638 
7639     for (s = 0; s < opr_sz; ++s) {
7640         float64 *n = vn + s * sizeof(float64) * 4;
7641         float64 *m = vm + s * sizeof(float64) * 4;
7642         float64 *a = va + s * sizeof(float64) * 4;
7643         float64 *d = vd + s * sizeof(float64) * 4;
7644         float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7645         float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7646         float64 p0, p1;
7647 
7648         /* i = 0, j = 0 */
7649         p0 = float64_mul(n00, m00, status);
7650         p1 = float64_mul(n01, m01, status);
7651         d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7652 
7653         /* i = 0, j = 1 */
7654         p0 = float64_mul(n00, m10, status);
7655         p1 = float64_mul(n01, m11, status);
7656         d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7657 
7658         /* i = 1, j = 0 */
7659         p0 = float64_mul(n10, m00, status);
7660         p1 = float64_mul(n11, m01, status);
7661         d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7662 
7663         /* i = 1, j = 1 */
7664         p0 = float64_mul(n10, m10, status);
7665         p1 = float64_mul(n11, m11, status);
7666         d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7667     }
7668 }
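
/*
 * Both FMMLA helpers implement the same 2x2 matrix operation on each
 * 128-bit (single) or 256-bit (double) segment, with the matrices
 * stored row-major:
 *
 *   N = | n0 n1 |    M = | m0 m1 |    D = A + N * M^T
 *       | n2 n3 |        | m2 m3 |
 *
 *   d0 = a0 + n0*m0 + n1*m1      d1 = a1 + n0*m2 + n1*m3
 *   d2 = a2 + n2*m0 + n3*m1      d3 = a3 + n2*m2 + n3*m3
 */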
7669 
7670 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7671 void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
7672                   float_status *status, uint32_t desc)                        \
7673 {                                                                             \
7674     intptr_t i = simd_oprsz(desc);                                            \
7675     uint64_t *g = vg;                                                         \
7676     do {                                                                      \
7677         uint64_t pg = g[(i - 1) >> 6];                                        \
7678         do {                                                                  \
7679             i -= sizeof(TYPEW);                                               \
7680             if (likely((pg >> (i & 63)) & 1)) {                               \
7681                 TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
7682                 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
7683             }                                                                 \
7684         } while (i & 63);                                                     \
7685     } while (i != 0);                                                         \
7686 }
7687 
7688 DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7689 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7690 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7691 
7692 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7693 void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
7694                   float_status *status, uint32_t desc)                        \
7695 {                                                                             \
7696     intptr_t i = simd_oprsz(desc);                                            \
7697     uint64_t *g = vg;                                                         \
7698     do {                                                                      \
7699         uint64_t pg = g[(i - 1) >> 6];                                        \
7700         do {                                                                  \
7701             i -= sizeof(TYPEW);                                               \
7702             if (likely((pg >> (i & 63)) & 1)) {                               \
7703                 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
7704                 *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
7705             }                                                                 \
7706         } while (i & 63);                                                     \
7707     } while (i != 0);                                                         \
7708 }
7709 
7710 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7711 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
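
/*
 * In both macros above the trailing "T" is "top": FCVTNT converts each
 * active wide element of vn and writes the narrowed result into the top
 * half of the corresponding wide slot of vd, leaving the bottom half
 * untouched, while FCVTLT reads the narrow value from the top half of
 * each wide slot of vn and widens it into the full wide element of vd.
 */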
7712 
7713 #undef DO_FCVTLT
7714 #undef DO_FCVTNT
7715