xref: /qemu/target/arm/tcg/sve_helper.c (revision bcfee4938f8d4e8bf5f49981d3c8a78cf267cb4e)
1 /*
2  * ARM SVE Operations
3  *
4  * Copyright (c) 2018 Linaro, Ltd.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/page-protection.h"
25 #include "exec/helper-proto.h"
26 #include "exec/target_page.h"
27 #include "exec/tlb-flags.h"
28 #include "tcg/tcg-gvec-desc.h"
29 #include "fpu/softfloat.h"
30 #include "tcg/tcg.h"
31 #include "vec_internal.h"
32 #include "sve_ldst_internal.h"
33 #include "accel/tcg/cpu-ldst.h"
34 #include "accel/tcg/cpu-ops.h"
35 #ifdef CONFIG_USER_ONLY
36 #include "user/page-protection.h"
37 #endif
38 
39 
40 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
41  *
42  * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
43  * and bit 0 set if C is set.  Compare the definitions of these variables
44  * within CPUARMState.
45  */
46 
47 /* For no G bits set, NZCV = C.  */
48 #define PREDTEST_INIT  1
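/*
 * For example, a predicate whose first active element is true and whose
 * last active element is false yields N=1, Z=0, C=1, i.e. a return value
 * of 0x80000003 (bit 31 | bit 1 | bit 0).
 */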
49 
50 /* This is an iterative function, called for each Pd and Pg word
51  * moving forward.
52  */
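/*
 * Bit tricks used below: g & -g isolates the lowest set bit of G (the
 * first active element within this word), while pow2floor(g) isolates
 * the highest set bit (the last active element within this word).
 */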
53 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
54 {
55     if (likely(g)) {
56         /* Compute N from first D & G.
57            Use bit 2 to signal first G bit seen.  */
58         if (!(flags & 4)) {
59             flags |= ((d & (g & -g)) != 0) << 31;
60             flags |= 4;
61         }
62 
63         /* Accumulate Z from each D & G.  */
64         flags |= ((d & g) != 0) << 1;
65 
66         /* Compute C from last !(D & G).  Replace previous.  */
67         flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
68     }
69     return flags;
70 }
71 
72 /* This is an iterative function, called for each Pd and Pg word
73  * moving backward.
74  */
75 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
76 {
77     if (likely(g)) {
78         /* Compute C from first (i.e last) !(D & G).
79            Use bit 2 to signal first G bit seen.  */
80         if (!(flags & 4)) {
81             flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
82             flags |= (d & pow2floor(g)) == 0;
83         }
84 
85         /* Accumulate Z from each D & G.  */
86         flags |= ((d & g) != 0) << 1;
87 
88         /* Compute N from last (i.e first) D & G.  Replace previous.  */
89         flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
90     }
91     return flags;
92 }
93 
94 /* The same for a single word predicate.  */
95 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
96 {
97     return iter_predtest_fwd(d, g, PREDTEST_INIT);
98 }
99 
100 /* The same for a multi-word predicate.  */
101 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
102 {
103     uint32_t flags = PREDTEST_INIT;
104     uint64_t *d = vd, *g = vg;
105     uintptr_t i = 0;
106 
107     do {
108         flags = iter_predtest_fwd(d[i], g[i], flags);
109     } while (++i < words);
110 
111     return flags;
112 }
113 
114 /* Expand predicate bits for word (4-byte) elements into a 64-bit mask.  */
115 static inline uint64_t expand_pred_s(uint8_t byte)
116 {
117     static const uint64_t word[] = {
118         [0x01] = 0x00000000ffffffffull,
119         [0x10] = 0xffffffff00000000ull,
120         [0x11] = 0xffffffffffffffffull,
121     };
122     return word[byte & 0x11];
123 }
124 
125 #define LOGICAL_PPPP(NAME, FUNC) \
126 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
127 {                                                                         \
128     uintptr_t opr_sz = simd_oprsz(desc);                                  \
129     uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
130     uintptr_t i;                                                          \
131     for (i = 0; i < opr_sz / 8; ++i) {                                    \
132         d[i] = FUNC(n[i], m[i], g[i]);                                    \
133     }                                                                     \
134 }
135 
136 #define DO_AND(N, M, G)  (((N) & (M)) & (G))
137 #define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
138 #define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
139 #define DO_ORR(N, M, G)  (((N) | (M)) & (G))
140 #define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
141 #define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
142 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
143 #define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
144 
145 LOGICAL_PPPP(sve_and_pppp, DO_AND)
146 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
147 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
148 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
149 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
150 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
151 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
152 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
153 
154 #undef DO_AND
155 #undef DO_BIC
156 #undef DO_EOR
157 #undef DO_ORR
158 #undef DO_ORN
159 #undef DO_NOR
160 #undef DO_NAND
161 #undef DO_SEL
162 #undef LOGICAL_PPPP
163 
164 /* Fully general three-operand expander, controlled by a predicate.
165  * This is complicated by the host-endian storage of the register file.
166  */
167 /* ??? I don't expect the compiler could ever vectorize this itself.
168  * With some tables we can convert bit masks to byte masks, and with
169  * extra care wrt byte/word ordering we could use gcc generic vectors
170  * and do 16 bytes at a time.
171  */
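/*
 * The predicate holds one bit per byte of the vector; an element is
 * active when the predicate bit for its first (lowest-numbered) byte is
 * set.  Hence each 16-bit predicate chunk below governs 16 bytes of
 * data, "pg & 1" tests the current element, and "pg >>= sizeof(TYPE)"
 * advances to the next one.
 */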
172 #define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
173 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
174 {                                                                       \
175     intptr_t i, opr_sz = simd_oprsz(desc);                              \
176     for (i = 0; i < opr_sz; ) {                                         \
177         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
178         do {                                                            \
179             if (pg & 1) {                                               \
180                 TYPE nn = *(TYPE *)(vn + H(i));                         \
181                 TYPE mm = *(TYPE *)(vm + H(i));                         \
182                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
183             }                                                           \
184             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
185         } while (i & 15);                                               \
186     }                                                                   \
187 }
188 
189 /* Similarly, specialized for 64-bit operands.  */
190 #define DO_ZPZZ_D(NAME, TYPE, OP)                                \
191 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
192 {                                                               \
193     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
194     TYPE *d = vd, *n = vn, *m = vm;                             \
195     uint8_t *pg = vg;                                           \
196     for (i = 0; i < opr_sz; i += 1) {                           \
197         if (pg[H1(i)] & 1) {                                    \
198             TYPE nn = n[i], mm = m[i];                          \
199             d[i] = OP(nn, mm);                                  \
200         }                                                       \
201     }                                                           \
202 }
203 
204 #define DO_AND(N, M)  (N & M)
205 #define DO_EOR(N, M)  (N ^ M)
206 #define DO_ORR(N, M)  (N | M)
207 #define DO_BIC(N, M)  (N & ~M)
208 #define DO_ADD(N, M)  (N + M)
209 #define DO_SUB(N, M)  (N - M)
210 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
211 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
212 #define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
213 #define DO_MUL(N, M)  (N * M)
214 
215 
216 /*
217  * We must avoid the C undefined behaviour cases: division by
218  * zero and signed division of INT_MIN by -1. Both of these
219  * have architecturally defined required results for Arm.
220  * We special case all signed divisions by -1 to avoid having
221  * to deduce the minimum integer for the type involved.
222  */
223 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
224 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
225 
226 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
227 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
228 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
229 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
230 
231 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
232 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
233 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
234 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
235 
236 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
237 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
238 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
239 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
240 
241 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
242 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
243 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
244 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
245 
246 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
247 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
248 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
249 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
250 
251 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
252 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
253 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
254 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
255 
256 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
257 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
258 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
259 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
260 
261 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
262 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
263 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
264 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
265 
266 DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
267 DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
268 DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
269 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)
270 
271 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
272 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
273 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
274 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
275 
276 DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
277 DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
278 DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
279 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)
280 
281 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
282 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
283 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
284 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
285 
286 /* Because the computation type is at least twice as large as required,
287    these work for both signed and unsigned source types.  */
288 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
289 {
290     return (n * m) >> 8;
291 }
292 
293 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
294 {
295     return (n * m) >> 16;
296 }
297 
298 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
299 {
300     return (n * m) >> 32;
301 }
302 
303 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
304 {
305     uint64_t lo, hi;
306     muls64(&lo, &hi, n, m);
307     return hi;
308 }
309 
310 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
311 {
312     uint64_t lo, hi;
313     mulu64(&lo, &hi, n, m);
314     return hi;
315 }
316 
317 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
318 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
319 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
320 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
321 
322 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
323 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
324 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
325 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
326 
327 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
328 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
329 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
330 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
331 
332 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
333 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
334 
335 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
336 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
337 
338 /* Note that all bits of the shift are significant
339    and not modulo the element size.  */
340 #define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
341 #define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
342 #define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
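/*
 * For example, an 8-bit LSR or LSL by 8 or more yields 0, while an 8-bit
 * ASR by 8 or more yields 0 or -1 according to the sign of the input.
 */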
343 
344 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
345 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
346 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
347 
348 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
349 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
350 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
351 
352 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
353 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
354 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
355 
356 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
357 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
358 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
359 
360 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
361 {
362     int8_t n1 = n, n2 = n >> 8;
363     return m + n1 + n2;
364 }
365 
366 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
367 {
368     int16_t n1 = n, n2 = n >> 16;
369     return m + n1 + n2;
370 }
371 
372 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
373 {
374     int32_t n1 = n, n2 = n >> 32;
375     return m + n1 + n2;
376 }
377 
378 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
379 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
380 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
381 
382 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
383 {
384     uint8_t n1 = n, n2 = n >> 8;
385     return m + n1 + n2;
386 }
387 
388 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
389 {
390     uint16_t n1 = n, n2 = n >> 16;
391     return m + n1 + n2;
392 }
393 
394 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
395 {
396     uint32_t n1 = n, n2 = n >> 32;
397     return m + n1 + n2;
398 }
399 
400 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
401 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
402 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
403 
404 #define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
405 #define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
406 #define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
407 #define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)
408 
409 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
410 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
411 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
412 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
413 
414 #define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
415 #define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
416 #define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
417 #define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)
418 
419 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
420 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
421 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
422 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
423 
424 /*
425  * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
426  * We pass in a pointer to a dummy saturation field to trigger
427  * the saturating arithmetic but discard the information about
428  * whether it has occurred.
429  */
430 #define do_sqshl_b(n, m) \
431    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
432 #define do_sqshl_h(n, m) \
433    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
434 #define do_sqshl_s(n, m) \
435    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
436 #define do_sqshl_d(n, m) \
437    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
438 
439 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1, do_sqshl_b)
440 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
441 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
442 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
443 
444 #define do_uqshl_b(n, m) \
445    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
446 #define do_uqshl_h(n, m) \
447    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
448 #define do_uqshl_s(n, m) \
449    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
450 #define do_uqshl_d(n, m) \
451    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
452 
453 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1, do_uqshl_b)
454 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
455 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
456 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
457 
458 #define do_sqrshl_b(n, m) \
459    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
460 #define do_sqrshl_h(n, m) \
461    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
462 #define do_sqrshl_s(n, m) \
463    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
464 #define do_sqrshl_d(n, m) \
465    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
466 
467 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1, do_sqrshl_b)
468 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
469 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
470 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
471 
472 #undef do_sqrshl_d
473 
474 #define do_uqrshl_b(n, m) \
475    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
476 #define do_uqrshl_h(n, m) \
477    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
478 #define do_uqrshl_s(n, m) \
479    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
480 #define do_uqrshl_d(n, m) \
481    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
482 
483 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1, do_uqrshl_b)
484 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
485 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
486 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
487 
488 #undef do_uqrshl_d
489 
490 #define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
491 #define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))
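/*
 * The 64-bit form cannot widen to a larger type, so it averages the
 * shifted halves and adds back the carry produced when both low bits
 * are set.
 */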
492 
493 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
494 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
495 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
496 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
497 
498 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
499 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
500 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
501 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
502 
503 #define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
504 #define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))
505 
506 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
507 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
508 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
509 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
510 
511 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
512 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
513 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
514 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
515 
516 #define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
517 #define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))
518 
519 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
520 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
521 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
522 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
523 
524 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
525 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
526 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
527 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
528 
529 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
530 {
531     return val >= max ? max : val <= min ? min : val;
532 }
533 
534 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
535 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
536 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
537 
538 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
539 {
540     int64_t r = n + m;
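    /* Overflow iff the operands have the same sign and the sum's sign differs. */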
541     if (((r ^ n) & ~(n ^ m)) < 0) {
542         /* Signed overflow.  */
543         return r < 0 ? INT64_MAX : INT64_MIN;
544     }
545     return r;
546 }
547 
548 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
549 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
550 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
551 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
552 
553 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
554 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
555 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
556 
557 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
558 {
559     uint64_t r = n + m;
560     return r < n ? UINT64_MAX : r;
561 }
562 
563 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
564 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
565 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
566 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
567 
568 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
569 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
570 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
571 
572 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
573 {
574     int64_t r = n - m;
575     if (((r ^ n) & (n ^ m)) < 0) {
576         /* Signed overflow.  */
577         return r < 0 ? INT64_MAX : INT64_MIN;
578     }
579     return r;
580 }
581 
582 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
583 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
584 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
585 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
586 
587 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
588 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
589 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
590 
591 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
592 {
593     return n > m ? n - m : 0;
594 }
595 
596 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
597 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
598 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
599 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
600 
601 #define DO_SUQADD_B(n, m) \
602     do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
603 #define DO_SUQADD_H(n, m) \
604     do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
605 #define DO_SUQADD_S(n, m) \
606     do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
607 
608 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
609 {
610     uint64_t r = n + m;
611 
612     if (n < 0) {
613         /* Note that m - abs(n) cannot underflow. */
614         if (r > INT64_MAX) {
615             /* Result is either very large positive or negative. */
616             if (m > -n) {
617                 /* m > abs(n), so r is a very large positive. */
618                 return INT64_MAX;
619             }
620             /* Result is negative. */
621         }
622     } else {
623         /* Both inputs are positive: check for overflow.  */
624         if (r < m || r > INT64_MAX) {
625             return INT64_MAX;
626         }
627     }
628     return r;
629 }
630 
631 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
632 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
633 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
634 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
635 
636 #define DO_USQADD_B(n, m) \
637     do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
638 #define DO_USQADD_H(n, m) \
639     do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
640 #define DO_USQADD_S(n, m) \
641     do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
642 
643 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
644 {
645     uint64_t r = n + m;
646 
647     if (m < 0) {
648         return n < -m ? 0 : r;
649     }
650     return r < n ? UINT64_MAX : r;
651 }
652 
653 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
654 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
655 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
656 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
657 
658 #undef DO_ZPZZ
659 #undef DO_ZPZZ_D
660 
661 /*
662  * Three operand expander, operating on element pairs.
663  * If the slot I is even, the elements are from VN {I, I+1}.
664  * If the slot I is odd, the elements are from VM {I-1, I}.
665  * Load all of the input elements in each pair before overwriting output.
666  */
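/*
 * For example, with 32-bit elements, predicated ADDP produces
 *   d[0] = n[0] + n[1], d[1] = m[0] + m[1], d[2] = n[2] + n[3], ...
 * for each destination slot whose predicate bit is set.
 */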
667 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
668 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
669 {                                                               \
670     intptr_t i, opr_sz = simd_oprsz(desc);                      \
671     for (i = 0; i < opr_sz; ) {                                 \
672         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
673         do {                                                    \
674             TYPE n0 = *(TYPE *)(vn + H(i));                     \
675             TYPE m0 = *(TYPE *)(vm + H(i));                     \
676             TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
677             TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
678             if (pg & 1) {                                       \
679                 *(TYPE *)(vd + H(i)) = OP(n0, n1);              \
680             }                                                   \
681             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
682             if (pg & 1) {                                       \
683                 *(TYPE *)(vd + H(i)) = OP(m0, m1);              \
684             }                                                   \
685             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
686         } while (i & 15);                                       \
687     }                                                           \
688 }
689 
690 /* Similarly, specialized for 64-bit operands.  */
691 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
692 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
693 {                                                               \
694     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
695     TYPE *d = vd, *n = vn, *m = vm;                             \
696     uint8_t *pg = vg;                                           \
697     for (i = 0; i < opr_sz; i += 2) {                           \
698         TYPE n0 = n[i], n1 = n[i + 1];                          \
699         TYPE m0 = m[i], m1 = m[i + 1];                          \
700         if (pg[H1(i)] & 1) {                                    \
701             d[i] = OP(n0, n1);                                  \
702         }                                                       \
703         if (pg[H1(i + 1)] & 1) {                                \
704             d[i + 1] = OP(m0, m1);                              \
705         }                                                       \
706     }                                                           \
707 }
708 
709 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
710 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
711 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
712 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
713 
714 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
715 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
716 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
717 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
718 
719 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
720 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
721 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
722 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
723 
724 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
725 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
726 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
727 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
728 
729 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
730 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
731 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
732 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
733 
734 #undef DO_ZPZZ_PAIR
735 #undef DO_ZPZZ_PAIR_D
736 
737 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                              \
738 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
739                   float_status *status, uint32_t desc)                  \
740 {                                                                       \
741     intptr_t i, opr_sz = simd_oprsz(desc);                              \
742     for (i = 0; i < opr_sz; ) {                                         \
743         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
744         do {                                                            \
745             TYPE n0 = *(TYPE *)(vn + H(i));                             \
746             TYPE m0 = *(TYPE *)(vm + H(i));                             \
747             TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));              \
748             TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));              \
749             if (pg & 1) {                                               \
750                 *(TYPE *)(vd + H(i)) = OP(n0, n1, status);              \
751             }                                                           \
752             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
753             if (pg & 1) {                                               \
754                 *(TYPE *)(vd + H(i)) = OP(m0, m1, status);              \
755             }                                                           \
756             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
757         } while (i & 15);                                               \
758     }                                                                   \
759 }
760 
761 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
762 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
763 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
764 
765 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
766 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
767 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
768 
769 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
770 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
771 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
772 
773 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
774 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
775 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
776 
777 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
778 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
779 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
780 
781 #undef DO_ZPZZ_PAIR_FP
782 
783 /* Three-operand expander, controlled by a predicate, in which the
784  * third operand is "wide".  That is, for D = N op M, the same 64-bit
785  * value of M is used with all of the narrower values of N.
786  */
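/*
 * For example, sve_asr_zpzw_b shifts each active byte element within a
 * given doubleword of N by the single 64-bit shift amount taken from
 * the corresponding doubleword of M.
 */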
787 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
788 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
789 {                                                                       \
790     intptr_t i, opr_sz = simd_oprsz(desc);                              \
791     for (i = 0; i < opr_sz; ) {                                         \
792         uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
793         TYPEW mm = *(TYPEW *)(vm + i);                                  \
794         do {                                                            \
795             if (pg & 1) {                                               \
796                 TYPE nn = *(TYPE *)(vn + H(i));                         \
797                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
798             }                                                           \
799             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
800         } while (i & 7);                                                \
801     }                                                                   \
802 }
803 
804 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
805 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
806 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
807 
808 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
809 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
810 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
811 
812 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
813 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
814 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
815 
816 #undef DO_ZPZW
817 
818 /* Fully general two-operand expander, controlled by a predicate.
819  */
820 #define DO_ZPZ(NAME, TYPE, H, OP)                               \
821 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
822 {                                                               \
823     intptr_t i, opr_sz = simd_oprsz(desc);                      \
824     for (i = 0; i < opr_sz; ) {                                 \
825         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
826         do {                                                    \
827             if (pg & 1) {                                       \
828                 TYPE nn = *(TYPE *)(vn + H(i));                 \
829                 *(TYPE *)(vd + H(i)) = OP(nn);                  \
830             }                                                   \
831             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
832         } while (i & 15);                                       \
833     }                                                           \
834 }
835 
836 /* Similarly, specialized for 64-bit operands.  */
837 #define DO_ZPZ_D(NAME, TYPE, OP)                                \
838 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
839 {                                                               \
840     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
841     TYPE *d = vd, *n = vn;                                      \
842     uint8_t *pg = vg;                                           \
843     for (i = 0; i < opr_sz; i += 1) {                           \
844         if (pg[H1(i)] & 1) {                                    \
845             TYPE nn = n[i];                                     \
846             d[i] = OP(nn);                                      \
847         }                                                       \
848     }                                                           \
849 }
850 
851 #define DO_CLS_B(N)   (clrsb32(N) - 24)
852 #define DO_CLS_H(N)   (clrsb32(N) - 16)
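/*
 * clrsb32 operates on the 32-bit promotion of the element, so subtract
 * the 24 (or 16) redundant copies of the sign bit that the widening
 * introduces; DO_CLZ_B/DO_CLZ_H below compensate the same way for the
 * extra leading zeros.
 */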
853 
854 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
855 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
856 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
857 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
858 
859 #define DO_CLZ_B(N)   (clz32(N) - 24)
860 #define DO_CLZ_H(N)   (clz32(N) - 16)
861 
862 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
863 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
864 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
865 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
866 
867 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
868 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
869 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
870 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
871 
872 #define DO_CNOT(N)    (N == 0)
873 
874 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
875 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
876 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
877 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
878 
879 #define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))
880 
881 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
882 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
883 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
884 
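/*
 * The "ah" variants implement the FPCR.AH behaviour, under which FABS
 * and FNEG leave NaN inputs unmodified.
 */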
885 #define DO_AH_FABS_H(N) (float16_is_any_nan(N) ? (N) : DO_FABS(N))
886 #define DO_AH_FABS_S(N) (float32_is_any_nan(N) ? (N) : DO_FABS(N))
887 #define DO_AH_FABS_D(N) (float64_is_any_nan(N) ? (N) : DO_FABS(N))
888 
889 DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H)
890 DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S)
891 DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D)
892 
893 #define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))
894 
895 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
896 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
897 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
898 
899 #define DO_AH_FNEG_H(N) (float16_is_any_nan(N) ? (N) : DO_FNEG(N))
900 #define DO_AH_FNEG_S(N) (float32_is_any_nan(N) ? (N) : DO_FNEG(N))
901 #define DO_AH_FNEG_D(N) (float64_is_any_nan(N) ? (N) : DO_FNEG(N))
902 
903 DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H)
904 DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S)
905 DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D)
906 
907 #define DO_NOT(N)    (~N)
908 
909 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
910 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
911 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
912 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
913 
914 #define DO_SXTB(N)    ((int8_t)N)
915 #define DO_SXTH(N)    ((int16_t)N)
916 #define DO_SXTS(N)    ((int32_t)N)
917 #define DO_UXTB(N)    ((uint8_t)N)
918 #define DO_UXTH(N)    ((uint16_t)N)
919 #define DO_UXTS(N)    ((uint32_t)N)
920 
921 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
922 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
923 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
924 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
925 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
926 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
927 
928 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
929 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
930 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
931 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
932 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
933 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
934 
935 #define DO_ABS(N)    (N < 0 ? -N : N)
936 
937 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
938 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
939 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
940 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
941 
942 #define DO_NEG(N)    (-N)
943 
944 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
945 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
946 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
947 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
948 
949 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
950 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
951 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
952 
953 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
954 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
955 
956 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
957 
958 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
959 {
960     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
961     uint64_t *d = vd, *n = vn;
962     uint8_t *pg = vg;
963 
964     for (i = 0; i < opr_sz; i += 2) {
965         if (pg[H1(i)] & 1) {
966             uint64_t n0 = n[i + 0];
967             uint64_t n1 = n[i + 1];
968             d[i + 0] = n1;
969             d[i + 1] = n0;
970         }
971     }
972 }
973 
974 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
975 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
976 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
977 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
978 
979 #define DO_SQABS(X) \
980     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
981        x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
982 
983 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
984 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
985 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
986 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
987 
988 #define DO_SQNEG(X) \
989     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
990        x_ == min_ ? -min_ - 1 : -x_; })
991 
992 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
993 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
994 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
995 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
996 
997 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
998 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
999 
1000 /* Three-operand expander, unpredicated, in which the third operand is "wide".
1001  */
1002 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
1003 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1004 {                                                              \
1005     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1006     for (i = 0; i < opr_sz; ) {                                \
1007         TYPEW mm = *(TYPEW *)(vm + i);                         \
1008         do {                                                   \
1009             TYPE nn = *(TYPE *)(vn + H(i));                    \
1010             *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
1011             i += sizeof(TYPE);                                 \
1012         } while (i & 7);                                       \
1013     }                                                          \
1014 }
1015 
1016 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1017 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1018 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1019 
1020 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1021 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1022 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1023 
1024 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1025 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1026 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1027 
1028 #undef DO_ZZW
1029 
1030 #undef DO_CLS_B
1031 #undef DO_CLS_H
1032 #undef DO_CLZ_B
1033 #undef DO_CLZ_H
1034 #undef DO_CNOT
1035 #undef DO_FABS
1036 #undef DO_FNEG
1037 #undef DO_ABS
1038 #undef DO_NEG
1039 #undef DO_ZPZ
1040 #undef DO_ZPZ_D
1041 
1042 /*
1043  * Three-operand expander, unpredicated, in which the two inputs are
1044  * selected from the top or bottom half of the wide column.
1045  */
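/*
 * For example, SADDLB adds the even-numbered (bottom) narrow elements
 * of N and M, while SADDLT adds the odd-numbered (top) ones; sel1 and
 * sel2 below carry that choice as a byte offset into each wide column.
 */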
1046 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1047 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1048 {                                                                       \
1049     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1050     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1051     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1052     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1053         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1054         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1055         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
1056     }                                                                   \
1057 }
1058 
1059 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1060 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1061 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1062 
1063 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1064 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1065 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1066 
1067 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1068 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1069 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1070 
1071 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1072 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1073 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1074 
1075 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1076 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1077 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1078 
1079 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1080 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1081 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1082 
1083 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1084 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1085 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1086 
1087 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1088 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1089 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1090 
1091 /* Note that the multiply cannot overflow, but the doubling can. */
1092 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1093 {
1094     int16_t val = n * m;
1095     return DO_SQADD_H(val, val);
1096 }
1097 
1098 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1099 {
1100     int32_t val = n * m;
1101     return DO_SQADD_S(val, val);
1102 }
1103 
1104 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1105 {
1106     int64_t val = n * m;
1107     return do_sqadd_d(val, val);
1108 }
1109 
1110 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1111 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1112 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1113 
1114 #undef DO_ZZZ_TB
1115 
1116 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1117 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1118 {                                                              \
1119     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1120     int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1121     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1122         TYPEW nn = *(TYPEW *)(vn + HW(i));                     \
1123         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));              \
1124         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                   \
1125     }                                                          \
1126 }
1127 
1128 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1129 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1130 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1131 
1132 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1133 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1134 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1135 
1136 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1137 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1138 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1139 
1140 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1141 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1142 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1143 
1144 #undef DO_ZZZ_WTB
1145 
1146 #define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
1147 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1148 {                                                                       \
1149     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1150     intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1151     intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1152     for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
1153         TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
1154         TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
1155         *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
1156     }                                                                   \
1157 }
1158 
1159 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1160 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1161 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1162 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1163 
1164 #undef DO_ZZZ_NTB
1165 
1166 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1167 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1168 {                                                               \
1169     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1170     intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);            \
1171     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
1172         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
1173         TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));               \
1174         TYPEW aa = *(TYPEW *)(va + HW(i));                      \
1175         *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;               \
1176     }                                                           \
1177 }
1178 
1179 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1180 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1181 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1182 
1183 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1184 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1185 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1186 
1187 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1188 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1189 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1190 
1191 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1192 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1193 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1194 
1195 #define DO_NMUL(N, M)  -(N * M)
1196 
1197 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1198 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1199 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1200 
1201 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1202 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1203 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1204 
1205 #undef DO_ZZZW_ACC
1206 
1207 #define DO_XTNB(NAME, TYPE, OP) \
1208 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
1209 {                                                            \
1210     intptr_t i, opr_sz = simd_oprsz(desc);                   \
1211     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
1212         TYPE nn = *(TYPE *)(vn + i);                         \
1213         nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
1214         *(TYPE *)(vd + i) = nn;                              \
1215     }                                                        \
1216 }
1217 
1218 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
1219 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
1220 {                                                                       \
1221     intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));      \
1222     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
1223         TYPE nn = *(TYPE *)(vn + i);                                    \
1224         *(TYPEN *)(vd + i + odd) = OP(nn);                              \
1225     }                                                                   \
1226 }
1227 
1228 #define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
1229 #define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
1230 #define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)
1231 
1232 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1233 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1234 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1235 
1236 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1237 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1238 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1239 
1240 #define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
1241 #define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
1242 #define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)
1243 
1244 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1245 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1246 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1247 
1248 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1249 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1250 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1251 
1252 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1253 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1254 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1255 
1256 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1257 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1258 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1259 
1260 #undef DO_XTNB
1261 #undef DO_XTNT
1262 
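/*
 * ADCLB/ADCLT, add-with-carry long: the carry-in is taken from just
 * above the low half of each element of vm (bit 32 here; bit 0 of the
 * odd doubleword in the 64-bit form below), and the widened sum leaves
 * the carry-out in that same bit position for the next link in the chain.
 */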
1263 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1264 {
1265     intptr_t i, opr_sz = simd_oprsz(desc);
1266     int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1267     uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1268     uint32_t *a = va, *n = vn;
1269     uint64_t *d = vd, *m = vm;
1270 
1271     for (i = 0; i < opr_sz / 8; ++i) {
1272         uint32_t e1 = a[2 * i + H4(0)];
1273         uint32_t e2 = n[2 * i + sel] ^ inv;
1274         uint64_t c = extract64(m[i], 32, 1);
1275         /* Compute and store the entire 33-bit result at once. */
1276         d[i] = c + e1 + e2;
1277     }
1278 }
1279 
1280 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1281 {
1282     intptr_t i, opr_sz = simd_oprsz(desc);
1283     int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1284     uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1285     uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1286 
1287     for (i = 0; i < opr_sz / 8; i += 2) {
1288         Int128 e1 = int128_make64(a[i]);
1289         Int128 e2 = int128_make64(n[i + sel] ^ inv);
1290         Int128 c = int128_make64(m[i + 1] & 1);
1291         Int128 r = int128_add(int128_add(e1, e2), c);
1292         d[i + 0] = int128_getlo(r);
1293         d[i + 1] = int128_gethi(r);
1294     }
1295 }
1296 
1297 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1298 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1299 {                                                                       \
1300     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1301     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1302     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1303     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1304         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1305         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1306         TYPEW aa = *(TYPEW *)(va + HW(i));                              \
1307         *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
1308     }                                                                   \
1309 }
1310 
1311 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1312            do_sqdmull_h, DO_SQADD_H)
1313 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1314            do_sqdmull_s, DO_SQADD_S)
1315 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1316            do_sqdmull_d, do_sqadd_d)
1317 
1318 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1319            do_sqdmull_h, DO_SQSUB_H)
1320 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1321            do_sqdmull_s, DO_SQSUB_S)
1322 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1323            do_sqdmull_d, do_sqsub_d)
1324 
1325 #undef DO_SQDMLAL
1326 
1327 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1328 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1329 {                                                               \
1330     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
1331     int rot = simd_data(desc);                                  \
1332     int sel_a = rot & 1, sel_b = sel_a ^ 1;                     \
1333     bool sub_r = rot == 1 || rot == 2;                          \
1334     bool sub_i = rot >= 2;                                      \
1335     TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
1336     for (i = 0; i < opr_sz; i += 2) {                           \
1337         TYPE elt1_a = n[H(i + sel_a)];                          \
1338         TYPE elt2_a = m[H(i + sel_a)];                          \
1339         TYPE elt2_b = m[H(i + sel_b)];                          \
1340         d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);           \
1341         d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);   \
1342     }                                                           \
1343 }
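
/*
 * How the rotation decodes above (writing each even/odd pair of N and M as
 * nr + ni*j and mr + mi*j):
 *   rot 0:  d_r = a_r + nr * mr    d_i = a_i + nr * mi
 *   rot 1:  d_r = a_r - ni * mi    d_i = a_i + ni * mr
 *   rot 2:  d_r = a_r - nr * mr    d_i = a_i - nr * mi
 *   rot 3:  d_r = a_r + ni * mi    d_i = a_i - ni * mr
 * i.e. the four 90-degree rotations of a complex multiply-accumulate.
 */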
1344 
1345 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1346 
1347 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1348 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1349 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1350 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1351 
1352 #define DO_SQRDMLAH_B(N, M, A, S) \
1353     do_sqrdmlah_b(N, M, A, S, true)
1354 #define DO_SQRDMLAH_H(N, M, A, S) \
1355     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1356 #define DO_SQRDMLAH_S(N, M, A, S) \
1357     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1358 #define DO_SQRDMLAH_D(N, M, A, S) \
1359     do_sqrdmlah_d(N, M, A, S, true)
1360 
1361 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1362 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1363 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1364 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1365 
1366 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1367 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
1368 {                                                                           \
1369     intptr_t i, j, oprsz = simd_oprsz(desc);                                \
1370     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
1371     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
1372     int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
1373     bool sub_r = rot == 1 || rot == 2;                                      \
1374     bool sub_i = rot >= 2;                                                  \
1375     TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
1376     for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
1377         TYPE elt2_a = m[H(i + idx + sel_a)];                                \
1378         TYPE elt2_b = m[H(i + idx + sel_b)];                                \
1379         for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
1380             TYPE elt1_a = n[H(i + j + sel_a)];                              \
1381             d[H(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);           \
1382             d[H(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);   \
1383         }                                                                   \
1384     }                                                                       \
1385 }
1386 
1387 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1388 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1389 
1390 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1391 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1392 
1393 #undef DO_CMLA
1394 #undef DO_CMLA_FUNC
1395 #undef DO_CMLA_IDX_FUNC
1396 #undef DO_SQRDMLAH_B
1397 #undef DO_SQRDMLAH_H
1398 #undef DO_SQRDMLAH_S
1399 #undef DO_SQRDMLAH_D
1400 
1401 /* Note that N and M each bundle 4 narrow elements into one unit. */
1402 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1403                          int sel_a, int sel_b, int sub_i)
1404 {
1405     for (int i = 0; i <= 1; i++) {
1406         int32_t elt1_r = (int8_t)(n >> (16 * i));
1407         int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1408         int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1409         int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1410 
1411         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1412     }
1413     return a;
1414 }
1415 
1416 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1417                          int sel_a, int sel_b, int sub_i)
1418 {
1419     for (int i = 0; i <= 1; i++) {
1420         int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1421         int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1422         int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1423         int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1424 
1425         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1426     }
1427     return a;
1428 }
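
/*
 * The helpers below decode the rotation into these parameters: sel_a/sel_b
 * choose whether the real or imaginary half of each M pair is multiplied
 * by the real and imaginary halves of N, and sub_i supplies the sign:
 *   rot 0: a += r * m_r - i * m_i     rot 1: a += r * m_i + i * m_r
 *   rot 2: a += r * m_r + i * m_i     rot 3: a += r * m_i - i * m_r
 */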
1429 
1430 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1431                               void *va, uint32_t desc)
1432 {
1433     int opr_sz = simd_oprsz(desc);
1434     int rot = simd_data(desc);
1435     int sel_a = rot & 1;
1436     int sel_b = sel_a ^ 1;
1437     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1438     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1439 
1440     for (int e = 0; e < opr_sz / 4; e++) {
1441         d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1442     }
1443 }
1444 
1445 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1446                               void *va, uint32_t desc)
1447 {
1448     int opr_sz = simd_oprsz(desc);
1449     int rot = simd_data(desc);
1450     int sel_a = rot & 1;
1451     int sel_b = sel_a ^ 1;
1452     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1453     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1454 
1455     for (int e = 0; e < opr_sz / 8; e++) {
1456         d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1457     }
1458 }
1459 
1460 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1461                              void *va, uint32_t desc)
1462 {
1463     int opr_sz = simd_oprsz(desc);
1464     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1465     int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1466     int sel_a = rot & 1;
1467     int sel_b = sel_a ^ 1;
1468     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1469     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1470 
1471     for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1472         uint32_t seg_m = m[seg + idx];
1473         for (int e = 0; e < 4; e++) {
1474             d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1475                                    sel_a, sel_b, sub_i);
1476         }
1477     }
1478 }
1479 
1480 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1481                              void *va, uint32_t desc)
1482 {
1483     int seg, opr_sz = simd_oprsz(desc);
1484     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1485     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1486     int sel_a = rot & 1;
1487     int sel_b = sel_a ^ 1;
1488     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1489     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1490 
1491     for (seg = 0; seg < opr_sz / 8; seg += 2) {
1492         uint64_t seg_m = m[seg + idx];
1493         for (int e = 0; e < 2; e++) {
1494             d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1495                                    sel_a, sel_b, sub_i);
1496         }
1497     }
1498 }
1499 
1500 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1501 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1502 {                                                                       \
1503     intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
1504     intptr_t i, j, idx = simd_data(desc);                               \
1505     TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
1506     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
1507         TYPE mm = m[i];                                                 \
1508         for (j = 0; j < segment; j++) {                                 \
1509             d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
1510         }                                                               \
1511     }                                                                   \
1512 }
1513 
1514 #define DO_SQRDMLAH_H(N, M, A) \
1515     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1516 #define DO_SQRDMLAH_S(N, M, A) \
1517     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1518 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1519 
1520 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1521 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1522 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1523 
1524 #define DO_SQRDMLSH_H(N, M, A) \
1525     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1526 #define DO_SQRDMLSH_S(N, M, A) \
1527     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1528 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1529 
1530 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1531 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1532 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1533 
1534 #undef DO_ZZXZ
1535 
1536 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1537 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
1538 {                                                                         \
1539     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1540     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1541     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1542     for (i = 0; i < oprsz; i += 16) {                                     \
1543         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1544         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1545             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1546             TYPEW aa = *(TYPEW *)(va + HW(i + j));                        \
1547             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                  \
1548         }                                                                 \
1549     }                                                                     \
1550 }
1551 
1552 #define DO_MLA(N, M, A)  (A + N * M)
1553 
1554 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1555 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1556 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1557 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1558 
1559 #define DO_MLS(N, M, A)  (A - N * M)
1560 
1561 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1562 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1563 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1564 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1565 
1566 #define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
1567 #define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))
1568 
1569 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1570 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1571 
1572 #define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
1573 #define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))
1574 
1575 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1576 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1577 
1578 #undef DO_MLA
1579 #undef DO_MLS
1580 #undef DO_ZZXW
1581 
1582 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1583 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
1584 {                                                                         \
1585     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1586     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1587     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1588     for (i = 0; i < oprsz; i += 16) {                                     \
1589         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1590         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1591             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1592             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                      \
1593         }                                                                 \
1594     }                                                                     \
1595 }
1596 
1597 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1598 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1599 
1600 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1601 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1602 
1603 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1604 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1605 
1606 #undef DO_ZZX
1607 
1608 #define DO_BITPERM(NAME, TYPE, OP) \
1609 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1610 {                                                              \
1611     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1612     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {               \
1613         TYPE nn = *(TYPE *)(vn + i);                           \
1614         TYPE mm = *(TYPE *)(vm + i);                           \
1615         *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);      \
1616     }                                                          \
1617 }
1618 
1619 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1620 {
1621     uint64_t res = 0;
1622     int db, rb = 0;
1623 
1624     for (db = 0; db < n; ++db) {
1625         if ((mask >> db) & 1) {
1626             res |= ((data >> db) & 1) << rb;
1627             ++rb;
1628         }
1629     }
1630     return res;
1631 }
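
/*
 * bitextract() gathers the data bits selected by mask into contiguous low
 * result bits, as for x86 PEXT.  For example, with n = 8:
 *   bitextract(0b10110100, 0b11001100, 8) == 0b1001
 * (data bits 2, 3, 6 and 7 packed into result bits 0..3).
 */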
1632 
1633 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1634 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1635 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1636 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1637 
1638 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1639 {
1640     uint64_t res = 0;
1641     int rb, db = 0;
1642 
1643     for (rb = 0; rb < n; ++rb) {
1644         if ((mask >> rb) & 1) {
1645             res |= ((data >> db) & 1) << rb;
1646             ++db;
1647         }
1648     }
1649     return res;
1650 }
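
/*
 * bitdeposit() scatters the low data bits to the positions of the set mask
 * bits, as for x86 PDEP.  For example, with n = 8:
 *   bitdeposit(0b1011, 0b11001100, 8) == 0b10001100
 * (data bits 0..3 deposited into result bits 2, 3, 6 and 7).
 */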
1651 
1652 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1653 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1654 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1655 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1656 
1657 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1658 {
1659     uint64_t resm = 0, resu = 0;
1660     int db, rbm = 0, rbu = 0;
1661 
1662     for (db = 0; db < n; ++db) {
1663         uint64_t val = (data >> db) & 1;
1664         if ((mask >> db) & 1) {
1665             resm |= val << rbm++;
1666         } else {
1667             resu |= val << rbu++;
1668         }
1669     }
1670 
1671     return resm | (resu << rbm);
1672 }
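
/*
 * bitgroup() packs the data bits selected by mask into the low result bits
 * and the remaining data bits above them.  Continuing the example above:
 *   bitgroup(0b10110100, 0b11001100, 8) == 0b11001001
 * (selected bits 2, 3, 6, 7 end up in bits 0..3; bits 0, 1, 4, 5 above).
 */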
1673 
1674 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1675 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1676 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1677 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1678 
1679 #undef DO_BITPERM
1680 
1681 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
1682 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
1683 {                                                               \
1684     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1685     int sub_r = simd_data(desc);                                \
1686     if (sub_r) {                                                \
1687         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1688             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1689             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1690             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1691             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1692             acc_r = ADD_OP(acc_r, el2_i);                       \
1693             acc_i = SUB_OP(acc_i, el2_r);                       \
1694             *(TYPE *)(vd + H(i)) = acc_r;                       \
1695             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1696         }                                                       \
1697     } else {                                                    \
1698         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1699             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1700             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1701             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1702             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1703             acc_r = SUB_OP(acc_r, el2_i);                       \
1704             acc_i = ADD_OP(acc_i, el2_r);                       \
1705             *(TYPE *)(vd + H(i)) = acc_r;                       \
1706             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1707         }                                                       \
1708     }                                                           \
1709 }
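
/*
 * The two branches above are the two permitted rotations of the second
 * operand: with sub_r clear the result is (n_r - m_i, n_i + m_r), i.e. M
 * rotated by 90 degrees before the add; with sub_r set it is
 * (n_r + m_i, n_i - m_r), i.e. a 270 degree rotation.
 */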
1710 
1711 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1712 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1713 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1714 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1715 
1716 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1717 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1718 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1719 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1720 
1721 #undef DO_CADD
1722 
1723 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1724 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1725 {                                                              \
1726     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1727     intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);      \
1728     int shift = simd_data(desc) >> 1;                          \
1729     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1730         TYPEW nn = *(TYPEN *)(vn + HN(i + sel));               \
1731         *(TYPEW *)(vd + HW(i)) = nn << shift;                  \
1732     }                                                          \
1733 }
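
/*
 * simd_data packs both operands of the widening shift: bit 0 selects the
 * bottom (even) or top (odd) narrow source elements, and the remaining
 * bits give the left shift applied after widening.
 */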
1734 
1735 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1736 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1737 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1738 
1739 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1740 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1741 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1742 
1743 #undef DO_ZZI_SHLL
1744 
1745 /* Two-operand reduction expander, controlled by a predicate.
1746  * The difference between TYPERED and TYPERET has to do with
1747  * sign-extension.  E.g. for SMAX, TYPERED must be signed,
1748  * but TYPERET must be unsigned so that e.g. a 32-bit value
1749  * is not sign-extended to the ABI uint64_t return type.
1750  */
1751 /* ??? If we were to vectorize this by hand the reduction ordering
1752  * would change.  For integer operands, this is perfectly fine.
1753  */
1754 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1755 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1756 {                                                          \
1757     intptr_t i, opr_sz = simd_oprsz(desc);                 \
1758     TYPERED ret = INIT;                                    \
1759     for (i = 0; i < opr_sz; ) {                            \
1760         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
1761         do {                                               \
1762             if (pg & 1) {                                  \
1763                 TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
1764                 ret = OP(ret, nn);                         \
1765             }                                              \
1766             i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
1767         } while (i & 15);                                  \
1768     }                                                      \
1769     return (TYPERET)ret;                                   \
1770 }
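
/*
 * SVE predicates provide one bit per vector byte, so the per-element
 * control is bit 0 of each sizeof(TYPE)-bit group.  The loop above
 * therefore reads 16 predicate bits at a time (one 16-byte vector chunk),
 * tests pg & 1 for each element, and advances with pg >>= sizeof(TYPE).
 */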
1771 
1772 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
1773 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1774 {                                                          \
1775     intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
1776     TYPEE *n = vn;                                         \
1777     uint8_t *pg = vg;                                      \
1778     TYPER ret = INIT;                                      \
1779     for (i = 0; i < opr_sz; i += 1) {                      \
1780         if (pg[H1(i)] & 1) {                               \
1781             TYPEE nn = n[i];                               \
1782             ret = OP(ret, nn);                             \
1783         }                                                  \
1784     }                                                      \
1785     return ret;                                            \
1786 }
1787 
1788 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1789 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1790 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1791 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1792 
1793 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1794 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1795 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1796 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1797 
1798 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1799 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1800 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1801 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1802 
1803 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1804 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1805 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1806 
1807 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1808 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1809 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1810 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1811 
1812 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1813 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1814 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1815 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1816 
1817 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1818 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1819 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1820 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1821 
1822 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1823 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1824 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1825 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1826 
1827 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1828 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1829 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1830 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1831 
1832 #undef DO_VPZ
1833 #undef DO_VPZ_D
1834 
1835 /* Two vector operand, one scalar operand, unpredicated.  */
1836 #define DO_ZZI(NAME, TYPE, OP)                                       \
1837 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
1838 {                                                                    \
1839     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
1840     TYPE s = s64, *d = vd, *n = vn;                                  \
1841     for (i = 0; i < opr_sz; ++i) {                                   \
1842         d[i] = OP(n[i], s);                                          \
1843     }                                                                \
1844 }
1845 
1846 #define DO_SUBR(X, Y)   (Y - X)
1847 
1848 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1849 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1850 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1851 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1852 
1853 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1854 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1855 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1856 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1857 
1858 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1859 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1860 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1861 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1862 
1863 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1864 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1865 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1866 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1867 
1868 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1869 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1870 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1871 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1872 
1873 #undef DO_ZZI
1874 
1875 #undef DO_AND
1876 #undef DO_ORR
1877 #undef DO_EOR
1878 #undef DO_BIC
1879 #undef DO_ADD
1880 #undef DO_SUB
1881 #undef DO_MAX
1882 #undef DO_MIN
1883 #undef DO_ABD
1884 #undef DO_MUL
1885 #undef DO_DIV
1886 #undef DO_ASR
1887 #undef DO_LSR
1888 #undef DO_LSL
1889 #undef DO_SUBR
1890 
1891 /* Similar to the ARM LastActiveElement pseudocode function, except the
1892    result is multiplied by the element size.  This includes the not found
1893    indication; e.g. not found for esz=3 is -8.  */
1894 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1895 {
1896     uint64_t mask = pred_esz_masks[esz];
1897     intptr_t i = words;
1898 
1899     do {
1900         uint64_t this_g = g[--i] & mask;
1901         if (this_g) {
1902             return i * 64 + (63 - clz64(this_g));
1903         }
1904     } while (i > 0);
1905     return (intptr_t)-1 << esz;
1906 }
1907 
1908 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1909 {
1910     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1911     uint32_t flags = PREDTEST_INIT;
1912     uint64_t *d = vd, *g = vg;
1913     intptr_t i = 0;
1914 
1915     do {
1916         uint64_t this_d = d[i];
1917         uint64_t this_g = g[i];
1918 
1919         if (this_g) {
1920             if (!(flags & 4)) {
1921                 /* Set in D the first bit of G.  */
1922                 this_d |= this_g & -this_g;
1923                 d[i] = this_d;
1924             }
1925             flags = iter_predtest_fwd(this_d, this_g, flags);
1926         }
1927     } while (++i < words);
1928 
1929     return flags;
1930 }
1931 
1932 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1933 {
1934     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1935     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1936     uint32_t flags = PREDTEST_INIT;
1937     uint64_t *d = vd, *g = vg, esz_mask;
1938     intptr_t i, next;
1939 
1940     next = last_active_element(vd, words, esz) + (1 << esz);
1941     esz_mask = pred_esz_masks[esz];
1942 
1943     /* Similar to the pseudocode for pnext, but scaled by ESZ
1944        so that we find the correct bit.  */
1945     if (next < words * 64) {
1946         uint64_t mask = -1;
1947 
1948         if (next & 63) {
1949             mask = ~((1ull << (next & 63)) - 1);
1950             next &= -64;
1951         }
1952         do {
1953             uint64_t this_g = g[next / 64] & esz_mask & mask;
1954             if (this_g != 0) {
1955                 next = (next & -64) + ctz64(this_g);
1956                 break;
1957             }
1958             next += 64;
1959             mask = -1;
1960         } while (next < words * 64);
1961     }
1962 
1963     i = 0;
1964     do {
1965         uint64_t this_d = 0;
1966         if (i == next / 64) {
1967             this_d = 1ull << (next & 63);
1968         }
1969         d[i] = this_d;
1970         flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1971     } while (++i < words);
1972 
1973     return flags;
1974 }
1975 
1976 /*
1977  * Copy Zn into Zd, and store zero into inactive elements.
1978  * If inv, store zeros into the active elements.
1979  */
1980 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1981 {
1982     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1983     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1984     uint64_t *d = vd, *n = vn;
1985     uint8_t *pg = vg;
1986 
1987     for (i = 0; i < opr_sz; i += 1) {
1988         d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1989     }
1990 }
1991 
1992 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1993 {
1994     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1995     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1996     uint64_t *d = vd, *n = vn;
1997     uint8_t *pg = vg;
1998 
1999     for (i = 0; i < opr_sz; i += 1) {
2000         d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
2001     }
2002 }
2003 
2004 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
2005 {
2006     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2007     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2008     uint64_t *d = vd, *n = vn;
2009     uint8_t *pg = vg;
2010 
2011     for (i = 0; i < opr_sz; i += 1) {
2012         d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
2013     }
2014 }
2015 
2016 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
2017 {
2018     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2019     uint64_t *d = vd, *n = vn;
2020     uint8_t *pg = vg;
2021     uint8_t inv = simd_data(desc);
2022 
2023     for (i = 0; i < opr_sz; i += 1) {
2024         d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2025     }
2026 }
2027 
2028 /* Three-operand expander, immediate operand, controlled by a predicate.
2029  */
2030 #define DO_ZPZI(NAME, TYPE, H, OP)                              \
2031 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2032 {                                                               \
2033     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2034     TYPE imm = simd_data(desc);                                 \
2035     for (i = 0; i < opr_sz; ) {                                 \
2036         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
2037         do {                                                    \
2038             if (pg & 1) {                                       \
2039                 TYPE nn = *(TYPE *)(vn + H(i));                 \
2040                 *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
2041             }                                                   \
2042             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
2043         } while (i & 15);                                       \
2044     }                                                           \
2045 }
2046 
2047 /* Similarly, specialized for 64-bit operands.  */
2048 #define DO_ZPZI_D(NAME, TYPE, OP)                               \
2049 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2050 {                                                               \
2051     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
2052     TYPE *d = vd, *n = vn;                                      \
2053     TYPE imm = simd_data(desc);                                 \
2054     uint8_t *pg = vg;                                           \
2055     for (i = 0; i < opr_sz; i += 1) {                           \
2056         if (pg[H1(i)] & 1) {                                    \
2057             TYPE nn = n[i];                                     \
2058             d[i] = OP(nn, imm);                                 \
2059         }                                                       \
2060     }                                                           \
2061 }
2062 
2063 #define DO_SHR(N, M)  (N >> M)
2064 #define DO_SHL(N, M)  (N << M)
2065 
2066 /* Arithmetic shift right for division.  This rounds negative numbers
2067    toward zero as per signed division.  Therefore before shifting,
2068    when N is negative, add 2**M-1.  */
2069 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
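
/* Example: DO_ASRD(-7, 2) computes (-7 + 3) >> 2 == -1, matching -7 / 4
   truncated toward zero, where a plain arithmetic shift would give -2.  */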
2070 
2071 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2072 {
2073     if (likely(sh < 64)) {
2074         return (x >> sh) + ((x >> (sh - 1)) & 1);
2075     } else if (sh == 64) {
2076         return x >> 63;
2077     } else {
2078         return 0;
2079     }
2080 }
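
/*
 * Rounding example: do_urshr(11, 2) == (11 >> 2) + ((11 >> 1) & 1) == 3,
 * i.e. 11 / 4 rounded to nearest with ties rounded up.
 */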
2081 
2082 static inline int64_t do_srshr(int64_t x, unsigned sh)
2083 {
2084     if (likely(sh < 64)) {
2085         return (x >> sh) + ((x >> (sh - 1)) & 1);
2086     } else {
2087         /* Rounding the sign bit always produces 0. */
2088         return 0;
2089     }
2090 }
2091 
2092 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2093 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2094 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2095 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2096 
2097 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2098 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2099 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2100 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2101 
2102 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2103 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2104 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2105 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2106 
2107 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2108 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2109 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2110 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2111 
2112 /* SVE2 bitwise shift by immediate */
2113 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2114 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2115 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2116 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2117 
2118 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2119 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2120 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2121 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2122 
2123 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2124 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2125 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2126 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2127 
2128 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2129 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2130 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2131 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2132 
2133 #define do_suqrshl_b(n, m) \
2134    ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2135 #define do_suqrshl_h(n, m) \
2136    ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2137 #define do_suqrshl_s(n, m) \
2138    ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2139 #define do_suqrshl_d(n, m) \
2140    ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2141 
2142 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2143 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2144 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2145 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2146 
2147 #undef DO_ASRD
2148 #undef DO_ZPZI
2149 #undef DO_ZPZI_D
2150 
2151 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2152 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
2153 {                                                            \
2154     intptr_t i, opr_sz = simd_oprsz(desc);                   \
2155     int shift = simd_data(desc);                             \
2156     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
2157         TYPEW nn = *(TYPEW *)(vn + i);                       \
2158         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
2159     }                                                        \
2160 }
2161 
2162 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
2163 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
2164 {                                                                 \
2165     intptr_t i, opr_sz = simd_oprsz(desc);                        \
2166     int shift = simd_data(desc);                                  \
2167     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
2168         TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
2169         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
2170     }                                                             \
2171 }
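
/*
 * The B (bottom) expander stores the narrowed value as a full TYPEW, so
 * the odd-numbered narrow elements of the destination are zeroed; the T
 * (top) expander stores only the odd-numbered narrow elements and leaves
 * the even-numbered ones unchanged.
 */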
2172 
2173 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2174 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2175 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2176 
2177 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2178 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2179 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2180 
2181 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2182 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2183 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2184 
2185 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2186 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2187 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2188 
2189 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2190 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2191 #define DO_SQSHRUN_D(x, sh) \
2192     do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2193 
2194 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2195 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2196 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2197 
2198 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2199 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2200 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2201 
2202 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2203 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2204 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2205 
2206 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2207 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2208 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2209 
2210 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2211 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2212 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2213 
2214 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2215 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2216 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2217 
2218 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2219 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2220 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2221 
2222 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2223 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2224 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2225 
2226 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2227 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2228 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2229 
2230 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2231 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2232 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2233 
2234 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2235 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2236 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2237 
2238 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2239 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2240 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2241 
2242 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2243 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2244 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2245 
2246 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2247 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2248 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2249 
2250 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2251 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2252 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2253 
2254 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2255 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2256 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2257 
2258 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2259 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2260 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2261 
2262 #undef DO_SHRNB
2263 #undef DO_SHRNT
2264 
2265 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                           \
2266 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2267 {                                                                           \
2268     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2269     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2270         TYPEW nn = *(TYPEW *)(vn + i);                                      \
2271         TYPEW mm = *(TYPEW *)(vm + i);                                      \
2272         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);                      \
2273     }                                                                       \
2274 }
2275 
2276 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)                   \
2277 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2278 {                                                                           \
2279     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2280     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2281         TYPEW nn = *(TYPEW *)(vn + HW(i));                                  \
2282         TYPEW mm = *(TYPEW *)(vm + HW(i));                                  \
2283         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);         \
2284     }                                                                       \
2285 }
2286 
2287 #define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
2288 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2289 #define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
2290 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
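
/* Rounding example for the 16-to-8-bit case (SH == 8):
 *   DO_ADDHN(0x1280, 0x0060, 8)  == 0x12    (truncated high half of 0x12e0)
 *   DO_RADDHN(0x1280, 0x0060, 8) == 0x13    ((0x12e0 + 0x80) >> 8)
 */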
2291 
2292 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2293 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2294 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2295 
2296 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2297 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2298 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2299 
2300 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2301 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2302 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2303 
2304 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2305 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2306 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2307 
2308 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2309 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2310 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2311 
2312 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2313 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2314 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2315 
2316 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2317 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2318 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2319 
2320 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2321 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2322 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2323 
2324 #undef DO_RSUBHN
2325 #undef DO_SUBHN
2326 #undef DO_RADDHN
2327 #undef DO_ADDHN
2328 
2329 #undef DO_BINOPNB
2330 
2331 /* Fully general four-operand expander, controlled by a predicate.
2332  */
2333 #define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
2334 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2335                   void *vg, uint32_t desc)                    \
2336 {                                                             \
2337     intptr_t i, opr_sz = simd_oprsz(desc);                    \
2338     for (i = 0; i < opr_sz; ) {                               \
2339         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
2340         do {                                                  \
2341             if (pg & 1) {                                     \
2342                 TYPE nn = *(TYPE *)(vn + H(i));               \
2343                 TYPE mm = *(TYPE *)(vm + H(i));               \
2344                 TYPE aa = *(TYPE *)(va + H(i));               \
2345                 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
2346             }                                                 \
2347             i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
2348         } while (i & 15);                                     \
2349     }                                                         \
2350 }
2351 
2352 /* Similarly, specialized for 64-bit operands.  */
2353 #define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
2354 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2355                   void *vg, uint32_t desc)                    \
2356 {                                                             \
2357     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
2358     TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
2359     uint8_t *pg = vg;                                         \
2360     for (i = 0; i < opr_sz; i += 1) {                         \
2361         if (pg[H1(i)] & 1) {                                  \
2362             TYPE aa = a[i], nn = n[i], mm = m[i];             \
2363             d[i] = OP(aa, nn, mm);                            \
2364         }                                                     \
2365     }                                                         \
2366 }
2367 
2368 #define DO_MLA(A, N, M)  (A + N * M)
2369 #define DO_MLS(A, N, M)  (A - N * M)
2370 
2371 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2372 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2373 
2374 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2375 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2376 
2377 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2378 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2379 
2380 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2381 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2382 
2383 #undef DO_MLA
2384 #undef DO_MLS
2385 #undef DO_ZPZZZ
2386 #undef DO_ZPZZZ_D
2387 
2388 void HELPER(sve_index_b)(void *vd, uint32_t start,
2389                          uint32_t incr, uint32_t desc)
2390 {
2391     intptr_t i, opr_sz = simd_oprsz(desc);
2392     uint8_t *d = vd;
2393     for (i = 0; i < opr_sz; i += 1) {
2394         d[H1(i)] = start + i * incr;
2395     }
2396 }
2397 
2398 void HELPER(sve_index_h)(void *vd, uint32_t start,
2399                          uint32_t incr, uint32_t desc)
2400 {
2401     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2402     uint16_t *d = vd;
2403     for (i = 0; i < opr_sz; i += 1) {
2404         d[H2(i)] = start + i * incr;
2405     }
2406 }
2407 
2408 void HELPER(sve_index_s)(void *vd, uint32_t start,
2409                          uint32_t incr, uint32_t desc)
2410 {
2411     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2412     uint32_t *d = vd;
2413     for (i = 0; i < opr_sz; i += 1) {
2414         d[H4(i)] = start + i * incr;
2415     }
2416 }
2417 
2418 void HELPER(sve_index_d)(void *vd, uint64_t start,
2419                          uint64_t incr, uint32_t desc)
2420 {
2421     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2422     uint64_t *d = vd;
2423     for (i = 0; i < opr_sz; i += 1) {
2424         d[i] = start + i * incr;
2425     }
2426 }
2427 
2428 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2429 {
2430     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2431     uint32_t sh = simd_data(desc);
2432     uint32_t *d = vd, *n = vn, *m = vm;
2433     for (i = 0; i < opr_sz; i += 1) {
2434         d[i] = n[i] + (m[i] << sh);
2435     }
2436 }
2437 
2438 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2439 {
2440     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2441     uint64_t sh = simd_data(desc);
2442     uint64_t *d = vd, *n = vn, *m = vm;
2443     for (i = 0; i < opr_sz; i += 1) {
2444         d[i] = n[i] + (m[i] << sh);
2445     }
2446 }
2447 
2448 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2449 {
2450     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2451     uint64_t sh = simd_data(desc);
2452     uint64_t *d = vd, *n = vn, *m = vm;
2453     for (i = 0; i < opr_sz; i += 1) {
2454         d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2455     }
2456 }
2457 
2458 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2459 {
2460     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2461     uint64_t sh = simd_data(desc);
2462     uint64_t *d = vd, *n = vn, *m = vm;
2463     for (i = 0; i < opr_sz; i += 1) {
2464         d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2465     }
2466 }
2467 
2468 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2469 {
2470     /* These constants are copied directly from the ARM pseudocode.  */
2471     static const uint16_t coeff[] = {
2472         0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2473         0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2474         0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2475         0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2476     };
2477     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2478     uint16_t *d = vd, *n = vn;
2479 
2480     for (i = 0; i < opr_sz; i++) {
2481         uint16_t nn = n[i];
2482         intptr_t idx = extract32(nn, 0, 5);
2483         uint16_t exp = extract32(nn, 5, 5);
2484         d[i] = coeff[idx] | (exp << 10);
2485     }
2486 }
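
/*
 * The coefficients are the fraction bits of 2^(idx/32), so ORing in the
 * exponent field gives a float16 of roughly 2^(exp - 15 + idx/32).  The
 * 32-bit and 64-bit variants below do the same with 64-entry tables of
 * 2^(idx/64).
 */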
2487 
2488 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2489 {
2490     /* These constants are copied directly from the ARM pseudocode.  */
2491     static const uint32_t coeff[] = {
2492         0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2493         0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2494         0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2495         0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2496         0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2497         0x1ef532, 0x20b051, 0x227043, 0x243516,
2498         0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2499         0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2500         0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2501         0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2502         0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2503         0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2504         0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2505         0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2506         0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2507         0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2508     };
2509     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2510     uint32_t *d = vd, *n = vn;
2511 
2512     for (i = 0; i < opr_sz; i++) {
2513         uint32_t nn = n[i];
2514         intptr_t idx = extract32(nn, 0, 6);
2515         uint32_t exp = extract32(nn, 6, 8);
2516         d[i] = coeff[idx] | (exp << 23);
2517     }
2518 }
2519 
2520 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2521 {
2522     /* These constants are copied directly from the ARM pseudocode.  */
2523     static const uint64_t coeff[] = {
2524         0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2525         0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2526         0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2527         0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2528         0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2529         0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2530         0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2531         0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2532         0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2533         0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2534         0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2535         0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2536         0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2537         0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2538         0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2539         0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2540         0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2541         0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2542         0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2543         0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2544         0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2545         0xFA7C1819E90D8ull,
2546     };
2547     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2548     uint64_t *d = vd, *n = vn;
2549 
2550     for (i = 0; i < opr_sz; i++) {
2551         uint64_t nn = n[i];
2552         intptr_t idx = extract32(nn, 0, 6);
2553         uint64_t exp = extract32(nn, 6, 11);
2554         d[i] = coeff[idx] | (exp << 52);
2555     }
2556 }
2557 
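/*
 * FTSSEL: bit 0 of the control element substitutes the constant 1.0 for
 * the first operand, and bit 1 negates the selected value; the
 * *_maybe_ah_chs helpers apply the FPCR.AH-dependent rules for that
 * sign change.
 */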
2558 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2559 {
2560     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2561     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2562     uint16_t *d = vd, *n = vn, *m = vm;
2563     for (i = 0; i < opr_sz; i += 1) {
2564         uint16_t nn = n[i];
2565         uint16_t mm = m[i];
2566         if (mm & 1) {
2567             nn = float16_one;
2568         }
2569         if (mm & 2) {
2570             nn = float16_maybe_ah_chs(nn, fpcr_ah);
2571         }
2572         d[i] = nn;
2573     }
2574 }
2575 
2576 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2577 {
2578     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2579     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2580     uint32_t *d = vd, *n = vn, *m = vm;
2581     for (i = 0; i < opr_sz; i += 1) {
2582         uint32_t nn = n[i];
2583         uint32_t mm = m[i];
2584         if (mm & 1) {
2585             nn = float32_one;
2586         }
2587         if (mm & 2) {
2588             nn = float32_maybe_ah_chs(nn, fpcr_ah);
2589         }
2590         d[i] = nn;
2591     }
2592 }
2593 
2594 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2595 {
2596     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2597     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2598     uint64_t *d = vd, *n = vn, *m = vm;
2599     for (i = 0; i < opr_sz; i += 1) {
2600         uint64_t nn = n[i];
2601         uint64_t mm = m[i];
2602         if (mm & 1) {
2603             nn = float64_one;
2604         }
2605         if (mm & 2) {
2606             nn = float64_maybe_ah_chs(nn, fpcr_ah);
2607         }
2608         d[i] = nn;
2609     }
2610 }
2611 
2612 /*
2613  * Signed saturating addition with scalar operand.
2614  */
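/*
 * The scalar operand B is passed pre-widened (int32_t for the byte and
 * halfword forms), so a negative B can presumably encode the saturating
 * decrement forms as well; the DO_SQADD_* macros are expected to clamp
 * the widened sum to the element range, e.g. 0x60 + 0x50 -> 0x7f for bytes.
 */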
2615 
2616 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2617 {
2618     intptr_t i, oprsz = simd_oprsz(desc);
2619 
2620     for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2621         *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2622     }
2623 }
2624 
2625 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2626 {
2627     intptr_t i, oprsz = simd_oprsz(desc);
2628 
2629     for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2630         *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2631     }
2632 }
2633 
2634 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2635 {
2636     intptr_t i, oprsz = simd_oprsz(desc);
2637 
2638     for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2639         *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2640     }
2641 }
2642 
2643 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2644 {
2645     intptr_t i, oprsz = simd_oprsz(desc);
2646 
2647     for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2648         *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2649     }
2650 }
2651 
2652 /*
2653  * Unsigned saturating addition with scalar operand.
2654  */
2655 
2656 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2657 {
2658     intptr_t i, oprsz = simd_oprsz(desc);
2659 
2660     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2661         *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2662     }
2663 }
2664 
2665 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2666 {
2667     intptr_t i, oprsz = simd_oprsz(desc);
2668 
2669     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2670         *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2671     }
2672 }
2673 
2674 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2675 {
2676     intptr_t i, oprsz = simd_oprsz(desc);
2677 
2678     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2679         *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2680     }
2681 }
2682 
2683 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2684 {
2685     intptr_t i, oprsz = simd_oprsz(desc);
2686 
2687     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2688         *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2689     }
2690 }
2691 
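/*
 * Only the 64-bit element size needs an explicit saturating subtract:
 * narrower decrements can presumably be expressed as uqaddi with a
 * negative scalar, while a 64-bit decrement amount cannot be negated
 * into the uint64_t operand without losing information.
 */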
2692 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2693 {
2694     intptr_t i, oprsz = simd_oprsz(desc);
2695 
2696     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2697         *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2698     }
2699 }
2700 
2701 /* Two operand predicated copy immediate with merge.  All valid immediates
2702  * can fit within 17 signed bits in the simd_data field.
2703  */
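/*
 * expand_pred_b and friends turn one byte of predicate bits into a
 * 64-bit lane mask, e.g. expand_pred_b(0x05) == 0x0000000000ff00ff,
 * selecting byte elements 0 and 2.
 */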
2704 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2705                          uint64_t mm, uint32_t desc)
2706 {
2707     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2708     uint64_t *d = vd, *n = vn;
2709     uint8_t *pg = vg;
2710 
2711     mm = dup_const(MO_8, mm);
2712     for (i = 0; i < opr_sz; i += 1) {
2713         uint64_t nn = n[i];
2714         uint64_t pp = expand_pred_b(pg[H1(i)]);
2715         d[i] = (mm & pp) | (nn & ~pp);
2716     }
2717 }
2718 
2719 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2720                          uint64_t mm, uint32_t desc)
2721 {
2722     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2723     uint64_t *d = vd, *n = vn;
2724     uint8_t *pg = vg;
2725 
2726     mm = dup_const(MO_16, mm);
2727     for (i = 0; i < opr_sz; i += 1) {
2728         uint64_t nn = n[i];
2729         uint64_t pp = expand_pred_h(pg[H1(i)]);
2730         d[i] = (mm & pp) | (nn & ~pp);
2731     }
2732 }
2733 
2734 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2735                          uint64_t mm, uint32_t desc)
2736 {
2737     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2738     uint64_t *d = vd, *n = vn;
2739     uint8_t *pg = vg;
2740 
2741     mm = dup_const(MO_32, mm);
2742     for (i = 0; i < opr_sz; i += 1) {
2743         uint64_t nn = n[i];
2744         uint64_t pp = expand_pred_s(pg[H1(i)]);
2745         d[i] = (mm & pp) | (nn & ~pp);
2746     }
2747 }
2748 
2749 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2750                          uint64_t mm, uint32_t desc)
2751 {
2752     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2753     uint64_t *d = vd, *n = vn;
2754     uint8_t *pg = vg;
2755 
2756     for (i = 0; i < opr_sz; i += 1) {
2757         uint64_t nn = n[i];
2758         d[i] = (pg[H1(i)] & 1 ? mm : nn);
2759     }
2760 }
2761 
2762 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2763 {
2764     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2765     uint64_t *d = vd;
2766     uint8_t *pg = vg;
2767 
2768     val = dup_const(MO_8, val);
2769     for (i = 0; i < opr_sz; i += 1) {
2770         d[i] = val & expand_pred_b(pg[H1(i)]);
2771     }
2772 }
2773 
2774 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2775 {
2776     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2777     uint64_t *d = vd;
2778     uint8_t *pg = vg;
2779 
2780     val = dup_const(MO_16, val);
2781     for (i = 0; i < opr_sz; i += 1) {
2782         d[i] = val & expand_pred_h(pg[H1(i)]);
2783     }
2784 }
2785 
2786 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2787 {
2788     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2789     uint64_t *d = vd;
2790     uint8_t *pg = vg;
2791 
2792     val = dup_const(MO_32, val);
2793     for (i = 0; i < opr_sz; i += 1) {
2794         d[i] = val & expand_pred_s(pg[H1(i)]);
2795     }
2796 }
2797 
2798 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2799 {
2800     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2801     uint64_t *d = vd;
2802     uint8_t *pg = vg;
2803 
2804     for (i = 0; i < opr_sz; i += 1) {
2805         d[i] = (pg[H1(i)] & 1 ? val : 0);
2806     }
2807 }
2808 
2809 /* Big-endian hosts need to frob the byte indices.  If the copy
2810  * happens to be 8-byte aligned, then no frobbing is necessary.
2811  */
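/*
 * On a big-endian host the vector is stored as an array of host-order
 * uint64_t, so architectural byte I lives at host offset I ^ 7 within its
 * word (the H1 adjustment).  Copying in units that match the region's
 * alignment keeps the adjusted addresses consistent between source and
 * destination.
 */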
2812 static void swap_memmove(void *vd, void *vs, size_t n)
2813 {
2814     uintptr_t d = (uintptr_t)vd;
2815     uintptr_t s = (uintptr_t)vs;
2816     uintptr_t o = (d | s | n) & 7;
2817     size_t i;
2818 
2819 #if !HOST_BIG_ENDIAN
2820     o = 0;
2821 #endif
2822     switch (o) {
2823     case 0:
2824         memmove(vd, vs, n);
2825         break;
2826 
2827     case 4:
2828         if (d < s || d >= s + n) {
2829             for (i = 0; i < n; i += 4) {
2830                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2831             }
2832         } else {
2833             for (i = n; i > 0; ) {
2834                 i -= 4;
2835                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2836             }
2837         }
2838         break;
2839 
2840     case 2:
2841     case 6:
2842         if (d < s || d >= s + n) {
2843             for (i = 0; i < n; i += 2) {
2844                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2845             }
2846         } else {
2847             for (i = n; i > 0; ) {
2848                 i -= 2;
2849                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2850             }
2851         }
2852         break;
2853 
2854     default:
2855         if (d < s || d >= s + n) {
2856             for (i = 0; i < n; i++) {
2857                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2858             }
2859         } else {
2860             for (i = n; i > 0; ) {
2861                 i -= 1;
2862                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2863             }
2864         }
2865         break;
2866     }
2867 }
2868 
2869 /* Similarly for memset of 0.  */
2870 static void swap_memzero(void *vd, size_t n)
2871 {
2872     uintptr_t d = (uintptr_t)vd;
2873     uintptr_t o = (d | n) & 7;
2874     size_t i;
2875 
2876     /* Usually, the first bit of a predicate is set, so N is 0.  */
2877     if (likely(n == 0)) {
2878         return;
2879     }
2880 
2881 #if !HOST_BIG_ENDIAN
2882     o = 0;
2883 #endif
2884     switch (o) {
2885     case 0:
2886         memset(vd, 0, n);
2887         break;
2888 
2889     case 4:
2890         for (i = 0; i < n; i += 4) {
2891             *(uint32_t *)H1_4(d + i) = 0;
2892         }
2893         break;
2894 
2895     case 2:
2896     case 6:
2897         for (i = 0; i < n; i += 2) {
2898             *(uint16_t *)H1_2(d + i) = 0;
2899         }
2900         break;
2901 
2902     default:
2903         for (i = 0; i < n; i++) {
2904             *(uint8_t *)H1(d + i) = 0;
2905         }
2906         break;
2907     }
2908 }
2909 
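/*
 * EXT: the result is bytes [n_ofs, oprsz) of N followed by bytes
 * [0, n_ofs) of M, i.e. a byte extraction from the concatenation N:M
 * starting at offset n_ofs.
 */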
2910 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2911 {
2912     intptr_t opr_sz = simd_oprsz(desc);
2913     size_t n_ofs = simd_data(desc);
2914     size_t n_siz = opr_sz - n_ofs;
2915 
2916     if (vd != vm) {
2917         swap_memmove(vd, vn + n_ofs, n_siz);
2918         swap_memmove(vd + n_siz, vm, n_ofs);
2919     } else if (vd != vn) {
2920         swap_memmove(vd + n_siz, vd, n_ofs);
2921         swap_memmove(vd, vn + n_ofs, n_siz);
2922     } else {
2923         /* vd == vn == vm.  Need temp space.  */
2924         ARMVectorReg tmp;
2925         swap_memmove(&tmp, vm, n_ofs);
2926         swap_memmove(vd, vd + n_ofs, n_siz);
2927         memcpy(vd + n_siz, &tmp, n_ofs);
2928     }
2929 }
2930 
2931 #define DO_INSR(NAME, TYPE, H) \
2932 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2933 {                                                                  \
2934     intptr_t opr_sz = simd_oprsz(desc);                            \
2935     swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
2936     *(TYPE *)(vd + H(0)) = val;                                    \
2937 }
2938 
2939 DO_INSR(sve_insr_b, uint8_t, H1)
2940 DO_INSR(sve_insr_h, uint16_t, H1_2)
2941 DO_INSR(sve_insr_s, uint32_t, H1_4)
2942 DO_INSR(sve_insr_d, uint64_t, H1_8)
2943 
2944 #undef DO_INSR
2945 
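/*
 * REV (vector): exchange 64-bit chunks from the two ends of the vector,
 * reversing the elements within each chunk as it is moved (bswap64 for
 * bytes, hswap64 for halfwords, a 32-bit rotate for words, nothing for
 * doublewords).
 */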
2946 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2947 {
2948     intptr_t i, j, opr_sz = simd_oprsz(desc);
2949     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2950         uint64_t f = *(uint64_t *)(vn + i);
2951         uint64_t b = *(uint64_t *)(vn + j);
2952         *(uint64_t *)(vd + i) = bswap64(b);
2953         *(uint64_t *)(vd + j) = bswap64(f);
2954     }
2955 }
2956 
2957 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2958 {
2959     intptr_t i, j, opr_sz = simd_oprsz(desc);
2960     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2961         uint64_t f = *(uint64_t *)(vn + i);
2962         uint64_t b = *(uint64_t *)(vn + j);
2963         *(uint64_t *)(vd + i) = hswap64(b);
2964         *(uint64_t *)(vd + j) = hswap64(f);
2965     }
2966 }
2967 
2968 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2969 {
2970     intptr_t i, j, opr_sz = simd_oprsz(desc);
2971     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2972         uint64_t f = *(uint64_t *)(vn + i);
2973         uint64_t b = *(uint64_t *)(vn + j);
2974         *(uint64_t *)(vd + i) = rol64(b, 32);
2975         *(uint64_t *)(vd + j) = rol64(f, 32);
2976     }
2977 }
2978 
2979 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2980 {
2981     intptr_t i, j, opr_sz = simd_oprsz(desc);
2982     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2983         uint64_t f = *(uint64_t *)(vn + i);
2984         uint64_t b = *(uint64_t *)(vn + j);
2985         *(uint64_t *)(vd + i) = b;
2986         *(uint64_t *)(vd + j) = f;
2987     }
2988 }
2989 
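/*
 * TBL/TBX: each result element is a table lookup using the corresponding
 * index element.  An out-of-range index yields zero for TBL but leaves the
 * destination element unchanged for TBX (the "continue" below).  The SVE2
 * two-register form treats the register pair as a single table of
 * 2 * nelem entries.
 */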
2990 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2991 
2992 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2993                            bool is_tbx, tb_impl_fn *fn)
2994 {
2995     ARMVectorReg scratch;
2996     uintptr_t oprsz = simd_oprsz(desc);
2997 
2998     if (unlikely(vd == vn)) {
2999         vn = memcpy(&scratch, vn, oprsz);
3000     }
3001 
3002     fn(vd, vn, NULL, vm, oprsz, is_tbx);
3003 }
3004 
3005 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
3006                            uint32_t desc, bool is_tbx, tb_impl_fn *fn)
3007 {
3008     ARMVectorReg scratch;
3009     uintptr_t oprsz = simd_oprsz(desc);
3010 
3011     if (unlikely(vd == vn0)) {
3012         vn0 = memcpy(&scratch, vn0, oprsz);
3013         if (vd == vn1) {
3014             vn1 = vn0;
3015         }
3016     } else if (unlikely(vd == vn1)) {
3017         vn1 = memcpy(&scratch, vn1, oprsz);
3018     }
3019 
3020     fn(vd, vn0, vn1, vm, oprsz, is_tbx);
3021 }
3022 
3023 #define DO_TB(SUFF, TYPE, H)                                            \
3024 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
3025                                 void *vm, uintptr_t oprsz, bool is_tbx) \
3026 {                                                                       \
3027     TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
3028     uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
3029     for (i = 0; i < nelem; ++i) {                                       \
3030         TYPE index = indexes[H(i)], val = 0;                            \
3031         if (index < nelem) {                                            \
3032             val = tbl0[H(index)];                                       \
3033         } else {                                                        \
3034             index -= nelem;                                             \
3035             if (tbl1 && index < nelem) {                                \
3036                 val = tbl1[H(index)];                                   \
3037             } else if (is_tbx) {                                        \
3038                 continue;                                               \
3039             }                                                           \
3040         }                                                               \
3041         d[H(i)] = val;                                                  \
3042     }                                                                   \
3043 }                                                                       \
3044 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3045 {                                                                       \
3046     do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                     \
3047 }                                                                       \
3048 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,            \
3049                              void *vm, uint32_t desc)                   \
3050 {                                                                       \
3051     do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);               \
3052 }                                                                       \
3053 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3054 {                                                                       \
3055     do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                      \
3056 }
3057 
3058 DO_TB(b, uint8_t, H1)
3059 DO_TB(h, uint16_t, H2)
3060 DO_TB(s, uint32_t, H4)
3061 DO_TB(d, uint64_t, H8)
3062 
3063 #undef DO_TB
3064 
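/*
 * UNPK reads oprsz/2 bytes of narrow source while writing oprsz bytes of
 * widened destination, so an overlapping source must first be copied to a
 * scratch register.
 */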
3065 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3066 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
3067 {                                                              \
3068     intptr_t i, opr_sz = simd_oprsz(desc);                     \
3069     TYPED *d = vd;                                             \
3070     TYPES *n = vn;                                             \
3071     ARMVectorReg tmp;                                          \
3072     if (unlikely(vn - vd < opr_sz)) {                          \
3073         n = memcpy(&tmp, n, opr_sz / 2);                       \
3074     }                                                          \
3075     for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
3076         d[HD(i)] = n[HS(i)];                                   \
3077     }                                                          \
3078 }
3079 
3080 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3081 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3082 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3083 
3084 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3085 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3086 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3087 
3088 #undef DO_UNPK
3089 
3090 /* Mask of bits included in the even numbered predicates of width esz.
3091  * We also use this for expand_bits/compress_bits, and so extend the
3092  * same pattern out to 16-bit units.
3093  */
3094 static const uint64_t even_bit_esz_masks[5] = {
3095     0x5555555555555555ull,
3096     0x3333333333333333ull,
3097     0x0f0f0f0f0f0f0f0full,
3098     0x00ff00ff00ff00ffull,
3099     0x0000ffff0000ffffull,
3100 };
3101 
3102 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3103  * For N==0, this corresponds to the operation that in qemu/bitops.h
3104  * we call half_shuffle64; this algorithm is from Hacker's Delight,
3105  * section 7-2 Shuffling Bits.
3106  */
3107 static uint64_t expand_bits(uint64_t x, int n)
3108 {
3109     int i;
3110 
3111     x &= 0xffffffffu;
3112     for (i = 4; i >= n; i--) {
3113         int sh = 1 << i;
3114         x = ((x << sh) | x) & even_bit_esz_masks[i];
3115     }
3116     return x;
3117 }
3118 
3119 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3120  * For N==0, this corresponds to the operation that in qemu/bitops.h
3121  * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3122  * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3123  */
3124 static uint64_t compress_bits(uint64_t x, int n)
3125 {
3126     int i;
3127 
3128     for (i = n; i <= 4; i++) {
3129         int sh = 1 << i;
3130         x &= even_bit_esz_masks[i];
3131         x = (x >> sh) | x;
3132     }
3133     return x & 0xffffffffu;
3134 }
3135 
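/*
 * Worked example for esz=0: expand_bits(0b1011, 0) == 0b01000101 and
 * compress_bits(0b01000101, 0) == 0b1011; the two functions are exact
 * inverses on 32-bit inputs.  They provide the bit interleaving used by
 * the predicate ZIP/UZP/PUNPK helpers below.
 */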
3136 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3137 {
3138     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3139     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3140     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3141     int esize = 1 << esz;
3142     uint64_t *d = vd;
3143     intptr_t i;
3144 
3145     if (oprsz <= 8) {
3146         uint64_t nn = *(uint64_t *)vn;
3147         uint64_t mm = *(uint64_t *)vm;
3148         int half = 4 * oprsz;
3149 
3150         nn = extract64(nn, high * half, half);
3151         mm = extract64(mm, high * half, half);
3152         nn = expand_bits(nn, esz);
3153         mm = expand_bits(mm, esz);
3154         d[0] = nn | (mm << esize);
3155     } else {
3156         ARMPredicateReg tmp;
3157 
3158         /* We produce output faster than we consume input.
3159            Therefore we must be mindful of possible overlap.  */
3160         if (vd == vn) {
3161             vn = memcpy(&tmp, vn, oprsz);
3162             if (vd == vm) {
3163                 vm = vn;
3164             }
3165         } else if (vd == vm) {
3166             vm = memcpy(&tmp, vm, oprsz);
3167         }
3168         if (high) {
3169             high = oprsz >> 1;
3170         }
3171 
3172         if ((oprsz & 7) == 0) {
3173             uint32_t *n = vn, *m = vm;
3174             high >>= 2;
3175 
3176             for (i = 0; i < oprsz / 8; i++) {
3177                 uint64_t nn = n[H4(high + i)];
3178                 uint64_t mm = m[H4(high + i)];
3179 
3180                 nn = expand_bits(nn, esz);
3181                 mm = expand_bits(mm, esz);
3182                 d[i] = nn | (mm << esize);
3183             }
3184         } else {
3185             uint8_t *n = vn, *m = vm;
3186             uint16_t *d16 = vd;
3187 
3188             for (i = 0; i < oprsz / 2; i++) {
3189                 uint16_t nn = n[H1(high + i)];
3190                 uint16_t mm = m[H1(high + i)];
3191 
3192                 nn = expand_bits(nn, esz);
3193                 mm = expand_bits(mm, esz);
3194                 d16[H2(i)] = nn | (mm << esize);
3195             }
3196         }
3197     }
3198 }
3199 
3200 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3201 {
3202     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3203     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3204     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3205     uint64_t *d = vd, *n = vn, *m = vm;
3206     uint64_t l, h;
3207     intptr_t i;
3208 
3209     if (oprsz <= 8) {
3210         l = compress_bits(n[0] >> odd, esz);
3211         h = compress_bits(m[0] >> odd, esz);
3212         d[0] = l | (h << (4 * oprsz));
3213     } else {
3214         ARMPredicateReg tmp_m;
3215         intptr_t oprsz_16 = oprsz / 16;
3216 
3217         if ((vm - vd) < (uintptr_t)oprsz) {
3218             m = memcpy(&tmp_m, vm, oprsz);
3219         }
3220 
3221         for (i = 0; i < oprsz_16; i++) {
3222             l = n[2 * i + 0];
3223             h = n[2 * i + 1];
3224             l = compress_bits(l >> odd, esz);
3225             h = compress_bits(h >> odd, esz);
3226             d[i] = l | (h << 32);
3227         }
3228 
3229         /*
3230          * For VL which is not a multiple of 512, the results from M do not
3231          * align nicely with the uint64_t for D.  Put the aligned results
3232          * from M into TMP_M and then copy it into place afterward.
3233          */
3234         if (oprsz & 15) {
3235             int final_shift = (oprsz & 15) * 2;
3236 
3237             l = n[2 * i + 0];
3238             h = n[2 * i + 1];
3239             l = compress_bits(l >> odd, esz);
3240             h = compress_bits(h >> odd, esz);
3241             d[i] = l | (h << final_shift);
3242 
3243             for (i = 0; i < oprsz_16; i++) {
3244                 l = m[2 * i + 0];
3245                 h = m[2 * i + 1];
3246                 l = compress_bits(l >> odd, esz);
3247                 h = compress_bits(h >> odd, esz);
3248                 tmp_m.p[i] = l | (h << 32);
3249             }
3250             l = m[2 * i + 0];
3251             h = m[2 * i + 1];
3252             l = compress_bits(l >> odd, esz);
3253             h = compress_bits(h >> odd, esz);
3254             tmp_m.p[i] = l | (h << final_shift);
3255 
3256             swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3257         } else {
3258             for (i = 0; i < oprsz_16; i++) {
3259                 l = m[2 * i + 0];
3260                 h = m[2 * i + 1];
3261                 l = compress_bits(l >> odd, esz);
3262                 h = compress_bits(h >> odd, esz);
3263                 d[oprsz_16 + i] = l | (h << 32);
3264             }
3265         }
3266     }
3267 }
3268 
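/*
 * TRN (predicate): NN ends up occupying the even esz-sized bit groups and
 * MM the odd groups (one of the two being shifted into place), so the two
 * are disjoint and the final addition is equivalent to a bitwise OR.
 */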
3269 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3270 {
3271     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3272     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3273     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3274     uint64_t *d = vd, *n = vn, *m = vm;
3275     uint64_t mask;
3276     int shr, shl;
3277     intptr_t i;
3278 
3279     shl = 1 << esz;
3280     shr = 0;
3281     mask = even_bit_esz_masks[esz];
3282     if (odd) {
3283         mask <<= shl;
3284         shr = shl;
3285         shl = 0;
3286     }
3287 
3288     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3289         uint64_t nn = (n[i] & mask) >> shr;
3290         uint64_t mm = (m[i] & mask) << shl;
3291         d[i] = nn + mm;
3292     }
3293 }
3294 
3295 /* Reverse units of 2**N bits.  */
3296 static uint64_t reverse_bits_64(uint64_t x, int n)
3297 {
3298     int i, sh;
3299 
3300     x = bswap64(x);
3301     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3302         uint64_t mask = even_bit_esz_masks[i];
3303         x = ((x & mask) << sh) | ((x >> sh) & mask);
3304     }
3305     return x;
3306 }
3307 
3308 static uint8_t reverse_bits_8(uint8_t x, int n)
3309 {
3310     static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3311     int i, sh;
3312 
3313     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3314         x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3315     }
3316     return x;
3317 }
3318 
3319 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3320 {
3321     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3322     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3323     intptr_t i, oprsz_2 = oprsz / 2;
3324 
3325     if (oprsz <= 8) {
3326         uint64_t l = *(uint64_t *)vn;
3327         l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3328         *(uint64_t *)vd = l;
3329     } else if ((oprsz & 15) == 0) {
3330         for (i = 0; i < oprsz_2; i += 8) {
3331             intptr_t ih = oprsz - 8 - i;
3332             uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3333             uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3334             *(uint64_t *)(vd + i) = h;
3335             *(uint64_t *)(vd + ih) = l;
3336         }
3337     } else {
3338         for (i = 0; i < oprsz_2; i += 1) {
3339             intptr_t il = H1(i);
3340             intptr_t ih = H1(oprsz - 1 - i);
3341             uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3342             uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3343             *(uint8_t *)(vd + il) = h;
3344             *(uint8_t *)(vd + ih) = l;
3345         }
3346     }
3347 }
3348 
3349 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3350 {
3351     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3352     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3353     uint64_t *d = vd;
3354     intptr_t i;
3355 
3356     if (oprsz <= 8) {
3357         uint64_t nn = *(uint64_t *)vn;
3358         int half = 4 * oprsz;
3359 
3360         nn = extract64(nn, high * half, half);
3361         nn = expand_bits(nn, 0);
3362         d[0] = nn;
3363     } else {
3364         ARMPredicateReg tmp_n;
3365 
3366         /* We produce output faster than we consume input.
3367            Therefore we must be mindful of possible overlap.  */
3368         if ((vn - vd) < (uintptr_t)oprsz) {
3369             vn = memcpy(&tmp_n, vn, oprsz);
3370         }
3371         if (high) {
3372             high = oprsz >> 1;
3373         }
3374 
3375         if ((oprsz & 7) == 0) {
3376             uint32_t *n = vn;
3377             high >>= 2;
3378 
3379             for (i = 0; i < oprsz / 8; i++) {
3380                 uint64_t nn = n[H4(high + i)];
3381                 d[i] = expand_bits(nn, 0);
3382             }
3383         } else {
3384             uint16_t *d16 = vd;
3385             uint8_t *n = vn;
3386 
3387             for (i = 0; i < oprsz / 2; i++) {
3388                 uint16_t nn = n[H1(high + i)];
3389                 d16[H2(i)] = expand_bits(nn, 0);
3390             }
3391         }
3392     }
3393 }
3394 
3395 #define DO_ZIP(NAME, TYPE, H) \
3396 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
3397 {                                                                    \
3398     intptr_t oprsz = simd_oprsz(desc);                               \
3399     intptr_t odd_ofs = simd_data(desc);                              \
3400     intptr_t i, oprsz_2 = oprsz / 2;                                 \
3401     ARMVectorReg tmp_n, tmp_m;                                       \
3402     /* We produce output faster than we consume input.               \
3403        Therefore we must be mindful of possible overlap.  */         \
3404     if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
3405         vn = memcpy(&tmp_n, vn, oprsz);                              \
3406     }                                                                \
3407     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
3408         vm = memcpy(&tmp_m, vm, oprsz);                              \
3409     }                                                                \
3410     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
3411         *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3412         *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
3413             *(TYPE *)(vm + odd_ofs + H(i));                          \
3414     }                                                                \
3415     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
3416         memset(vd + oprsz - 16, 0, 16);                              \
3417     }                                                                \
3418 }
3419 
3420 DO_ZIP(sve_zip_b, uint8_t, H1)
3421 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3422 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3423 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3424 DO_ZIP(sve2_zip_q, Int128, )
3425 
3426 #define DO_UZP(NAME, TYPE, H) \
3427 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3428 {                                                                      \
3429     intptr_t oprsz = simd_oprsz(desc);                                 \
3430     intptr_t odd_ofs = simd_data(desc);                                \
3431     intptr_t i, p;                                                     \
3432     ARMVectorReg tmp_m;                                                \
3433     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
3434         vm = memcpy(&tmp_m, vm, oprsz);                                \
3435     }                                                                  \
3436     i = 0, p = odd_ofs;                                                \
3437     do {                                                               \
3438         *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
3439         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3440     } while (p < oprsz);                                               \
3441     p -= oprsz;                                                        \
3442     do {                                                               \
3443         *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
3444         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3445     } while (p < oprsz);                                               \
3446     tcg_debug_assert(i == oprsz);                                      \
3447 }
3448 
3449 DO_UZP(sve_uzp_b, uint8_t, H1)
3450 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3451 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3452 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3453 DO_UZP(sve2_uzp_q, Int128, )
3454 
3455 #define DO_TRN(NAME, TYPE, H) \
3456 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3457 {                                                                      \
3458     intptr_t oprsz = simd_oprsz(desc);                                 \
3459     intptr_t odd_ofs = simd_data(desc);                                \
3460     intptr_t i;                                                        \
3461     for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
3462         TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
3463         TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
3464         *(TYPE *)(vd + H(i + 0)) = ae;                                 \
3465         *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
3466     }                                                                  \
3467     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
3468         memset(vd + oprsz - 16, 0, 16);                                \
3469     }                                                                  \
3470 }
3471 
3472 DO_TRN(sve_trn_b, uint8_t, H1)
3473 DO_TRN(sve_trn_h, uint16_t, H1_2)
3474 DO_TRN(sve_trn_s, uint32_t, H1_4)
3475 DO_TRN(sve_trn_d, uint64_t, H1_8)
3476 DO_TRN(sve2_trn_q, Int128, )
3477 
3478 #undef DO_ZIP
3479 #undef DO_UZP
3480 #undef DO_TRN
3481 
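/*
 * COMPACT: gather the active elements to the low end of the destination
 * and zero-fill the remainder.
 */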
3482 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3483 {
3484     intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3485     uint32_t *d = vd, *n = vn;
3486     uint8_t *pg = vg;
3487 
3488     for (i = j = 0; i < opr_sz; i++) {
3489         if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3490             d[H4(j)] = n[H4(i)];
3491             j++;
3492         }
3493     }
3494     for (; j < opr_sz; j++) {
3495         d[H4(j)] = 0;
3496     }
3497 }
3498 
3499 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3500 {
3501     intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3502     uint64_t *d = vd, *n = vn;
3503     uint8_t *pg = vg;
3504 
3505     for (i = j = 0; i < opr_sz; i++) {
3506         if (pg[H1(i)] & 1) {
3507             d[j] = n[i];
3508             j++;
3509         }
3510     }
3511     for (; j < opr_sz; j++) {
3512         d[j] = 0;
3513     }
3514 }
3515 
3516 /* Similar to the ARM LastActiveElement pseudocode function, except the
3517  * result is multiplied by the element size.  This includes the not found
3518  * indication; e.g. not found for esz=3 is -8.
3519  */
3520 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3521 {
3522     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3523     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3524 
3525     return last_active_element(vg, words, esz);
3526 }
3527 
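/*
 * SPLICE: copy the contiguous span of N from the first through the last
 * active element into the low part of D, then fill the remainder with the
 * leading elements of M.
 */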
3528 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3529 {
3530     intptr_t opr_sz = simd_oprsz(desc) / 8;
3531     int esz = simd_data(desc);
3532     uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3533     intptr_t i, first_i, last_i;
3534     ARMVectorReg tmp;
3535 
3536     first_i = last_i = 0;
3537     first_g = last_g = 0;
3538 
3539     /* Find the extent of the active elements within VG.  */
3540     for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3541         pg = *(uint64_t *)(vg + i) & mask;
3542         if (pg) {
3543             if (last_g == 0) {
3544                 last_g = pg;
3545                 last_i = i;
3546             }
3547             first_g = pg;
3548             first_i = i;
3549         }
3550     }
3551 
3552     len = 0;
3553     if (first_g != 0) {
3554         first_i = first_i * 8 + ctz64(first_g);
3555         last_i = last_i * 8 + 63 - clz64(last_g);
3556         len = last_i - first_i + (1 << esz);
3557         if (vd == vm) {
3558             vm = memcpy(&tmp, vm, opr_sz * 8);
3559         }
3560         swap_memmove(vd, vn + first_i, len);
3561     }
3562     swap_memmove(vd + len, vm, opr_sz * 8 - len);
3563 }
3564 
3565 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3566                             void *vg, uint32_t desc)
3567 {
3568     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3569     uint64_t *d = vd, *n = vn, *m = vm;
3570     uint8_t *pg = vg;
3571 
3572     for (i = 0; i < opr_sz; i += 1) {
3573         uint64_t nn = n[i], mm = m[i];
3574         uint64_t pp = expand_pred_b(pg[H1(i)]);
3575         d[i] = (nn & pp) | (mm & ~pp);
3576     }
3577 }
3578 
3579 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3580                             void *vg, uint32_t desc)
3581 {
3582     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3583     uint64_t *d = vd, *n = vn, *m = vm;
3584     uint8_t *pg = vg;
3585 
3586     for (i = 0; i < opr_sz; i += 1) {
3587         uint64_t nn = n[i], mm = m[i];
3588         uint64_t pp = expand_pred_h(pg[H1(i)]);
3589         d[i] = (nn & pp) | (mm & ~pp);
3590     }
3591 }
3592 
3593 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3594                             void *vg, uint32_t desc)
3595 {
3596     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3597     uint64_t *d = vd, *n = vn, *m = vm;
3598     uint8_t *pg = vg;
3599 
3600     for (i = 0; i < opr_sz; i += 1) {
3601         uint64_t nn = n[i], mm = m[i];
3602         uint64_t pp = expand_pred_s(pg[H1(i)]);
3603         d[i] = (nn & pp) | (mm & ~pp);
3604     }
3605 }
3606 
3607 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3608                             void *vg, uint32_t desc)
3609 {
3610     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3611     uint64_t *d = vd, *n = vn, *m = vm;
3612     uint8_t *pg = vg;
3613 
3614     for (i = 0; i < opr_sz; i += 1) {
3615         uint64_t nn = n[i], mm = m[i];
3616         d[i] = (pg[H1(i)] & 1 ? nn : mm);
3617     }
3618 }
3619 
3620 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3621                             void *vg, uint32_t desc)
3622 {
3623     intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3624     Int128 *d = vd, *n = vn, *m = vm;
3625     uint16_t *pg = vg;
3626 
3627     for (i = 0; i < opr_sz; i += 1) {
3628         d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3629     }
3630 }
3631 
3632 /* Two operand comparison controlled by a predicate.
3633  * ??? It is very tempting to want to be able to expand this inline
3634  * with x86 instructions, e.g.
3635  *
3636  *    vcmpeqw    zm, zn, %ymm0
3637  *    vpmovmskb  %ymm0, %eax
3638  *    and        $0x5555, %eax
3639  *    and        pg, %eax
3640  *
3641  * or even aarch64, e.g.
3642  *
3643  *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3644  *    cmeq       v0.8h, zn, zm
3645  *    and        v0.8h, v0.8h, mask
3646  *    addv       h0, v0.8h
3647  *    and        v0.8b, pg
3648  *
3649  * However, coming up with an abstraction that allows vector inputs and
3650  * a scalar output, and also handles the byte-ordering of sub-uint64_t
3651  * scalar outputs, is tricky.
3652  */
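/*
 * Each outer iteration below produces one 64-bit predicate word covering
 * 64 bytes of vector: elements are visited from the top down and OUT is
 * shifted left by sizeof(TYPE) bits at each step, so every comparison
 * result lands at the byte offset of its element, matching the predicate
 * layout.
 */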
3653 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
3654 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3655 {                                                                            \
3656     intptr_t opr_sz = simd_oprsz(desc);                                      \
3657     uint32_t flags = PREDTEST_INIT;                                          \
3658     intptr_t i = opr_sz;                                                     \
3659     do {                                                                     \
3660         uint64_t out = 0, pg;                                                \
3661         do {                                                                 \
3662             i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
3663             TYPE nn = *(TYPE *)(vn + H(i));                                  \
3664             TYPE mm = *(TYPE *)(vm + H(i));                                  \
3665             out |= nn OP mm;                                                 \
3666         } while (i & 63);                                                    \
3667         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3668         out &= pg;                                                           \
3669         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3670         flags = iter_predtest_bwd(out, pg, flags);                           \
3671     } while (i > 0);                                                         \
3672     return flags;                                                            \
3673 }
3674 
3675 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3676     DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3677 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3678     DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3679 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3680     DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3681 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3682     DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3683 
3684 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
3685 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3686 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3687 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3688 
3689 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
3690 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3691 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3692 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3693 
3694 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
3695 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3696 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3697 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3698 
3699 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
3700 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3701 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3702 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3703 
3704 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
3705 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3706 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3707 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3708 
3709 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
3710 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3711 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3712 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3713 
3714 #undef DO_CMP_PPZZ_B
3715 #undef DO_CMP_PPZZ_H
3716 #undef DO_CMP_PPZZ_S
3717 #undef DO_CMP_PPZZ_D
3718 #undef DO_CMP_PPZZ
3719 
3720 /* Similar, but the second source is "wide".  */
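/*
 * Each 64-bit element of VM is compared against all of the narrow elements
 * of VN that share its 64-bit slot.
 */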
3721 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
3722 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3723 {                                                                            \
3724     intptr_t opr_sz = simd_oprsz(desc);                                      \
3725     uint32_t flags = PREDTEST_INIT;                                          \
3726     intptr_t i = opr_sz;                                                     \
3727     do {                                                                     \
3728         uint64_t out = 0, pg;                                                \
3729         do {                                                                 \
3730             TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
3731             do {                                                             \
3732                 i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
3733                 TYPE nn = *(TYPE *)(vn + H(i));                              \
3734                 out |= nn OP mm;                                             \
3735             } while (i & 7);                                                 \
3736         } while (i & 63);                                                    \
3737         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3738         out &= pg;                                                           \
3739         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3740         flags = iter_predtest_bwd(out, pg, flags);                           \
3741     } while (i > 0);                                                         \
3742     return flags;                                                            \
3743 }
3744 
3745 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3746     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
3747 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3748     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3749 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3750     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3751 
3752 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
3753 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3754 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3755 
3756 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
3757 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3758 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3759 
3760 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
3761 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
3762 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)
3763 
3764 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
3765 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
3766 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)
3767 
3768 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
3769 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3770 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3771 
3772 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
3773 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3774 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3775 
3776 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
3777 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
3778 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)
3779 
3780 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
3781 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
3782 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)
3783 
3784 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
3785 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3786 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3787 
3788 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
3789 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3790 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3791 
3792 #undef DO_CMP_PPZW_B
3793 #undef DO_CMP_PPZW_H
3794 #undef DO_CMP_PPZW_S
3795 #undef DO_CMP_PPZW
3796 
3797 /* Similar, but the second source is immediate.  */
3798 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
3799 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
3800 {                                                                    \
3801     intptr_t opr_sz = simd_oprsz(desc);                              \
3802     uint32_t flags = PREDTEST_INIT;                                  \
3803     TYPE mm = simd_data(desc);                                       \
3804     intptr_t i = opr_sz;                                             \
3805     do {                                                             \
3806         uint64_t out = 0, pg;                                        \
3807         do {                                                         \
3808             i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
3809             TYPE nn = *(TYPE *)(vn + H(i));                          \
3810             out |= nn OP mm;                                         \
3811         } while (i & 63);                                            \
3812         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
3813         out &= pg;                                                   \
3814         *(uint64_t *)(vd + (i >> 3)) = out;                          \
3815         flags = iter_predtest_bwd(out, pg, flags);                   \
3816     } while (i > 0);                                                 \
3817     return flags;                                                    \
3818 }
3819 
3820 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3821     DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3822 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3823     DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3824 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3825     DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3826 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3827     DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3828 
3829 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
3830 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3831 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3832 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3833 
3834 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
3835 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3836 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3837 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3838 
3839 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
3840 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3841 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3842 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3843 
3844 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
3845 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3846 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3847 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3848 
3849 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
3850 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3851 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3852 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3853 
3854 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
3855 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3856 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3857 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3858 
3859 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
3860 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3861 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3862 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3863 
3864 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
3865 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3866 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3867 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3868 
3869 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
3870 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3871 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3872 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3873 
3874 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
3875 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3876 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3877 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3878 
3879 #undef DO_CMP_PPZI_B
3880 #undef DO_CMP_PPZI_H
3881 #undef DO_CMP_PPZI_S
3882 #undef DO_CMP_PPZI_D
3883 #undef DO_CMP_PPZI
3884 
3885 /* Similar to the ARM LastActive pseudocode function.  */
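/* pow2floor(pg) isolates the most significant guard bit of this word;
 * the result is whether the corresponding element is also set in VD,
 * i.e. whether the last active element is true.
 */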
3886 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3887 {
3888     intptr_t i;
3889 
3890     for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3891         uint64_t pg = *(uint64_t *)(vg + i);
3892         if (pg) {
3893             return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3894         }
3895     }
3896     return 0;
3897 }
3898 
3899 /* Compute a mask into RETB that is true for all G, up to and including
3900  * (if after) or excluding (if !after) the first G & N.
3901  * Return true if BRK found.
3902  */
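/* For example, with G = 0xff and N = 0x10: AFTER gives RETB = 0x1f
 * (break after the first active N bit), !AFTER gives RETB = 0x0f
 * (break before it).
 */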
3903 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3904                         bool brk, bool after)
3905 {
3906     uint64_t b;
3907 
3908     if (brk) {
3909         b = 0;
3910     } else if ((g & n) == 0) {
3911         /* For all G, no N are set; break not found.  */
3912         b = g;
3913     } else {
3914         /* Break somewhere in N.  Locate it.  */
3915         b = g & n;            /* guard true, pred true */
3916         b = b & -b;           /* first such */
3917         if (after) {
3918             b = b | (b - 1);  /* break after same */
3919         } else {
3920             b = b - 1;        /* break before same */
3921         }
3922         brk = true;
3923     }
3924 
3925     *retb = b;
3926     return brk;
3927 }
3928 
3929 /* Compute a zeroing BRK.  */
3930 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3931                           intptr_t oprsz, bool after)
3932 {
3933     bool brk = false;
3934     intptr_t i;
3935 
3936     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3937         uint64_t this_b, this_g = g[i];
3938 
3939         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3940         d[i] = this_b & this_g;
3941     }
3942 }
3943 
3944 /* Likewise, but also compute flags.  */
3945 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3946                                intptr_t oprsz, bool after)
3947 {
3948     uint32_t flags = PREDTEST_INIT;
3949     bool brk = false;
3950     intptr_t i;
3951 
3952     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3953         uint64_t this_b, this_d, this_g = g[i];
3954 
3955         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3956         d[i] = this_d = this_b & this_g;
3957         flags = iter_predtest_fwd(this_d, this_g, flags);
3958     }
3959     return flags;
3960 }
3961 
3962 /* Compute a merging BRK.  */
3963 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3964                           intptr_t oprsz, bool after)
3965 {
3966     bool brk = false;
3967     intptr_t i;
3968 
3969     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3970         uint64_t this_b, this_g = g[i];
3971 
3972         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3973         d[i] = (this_b & this_g) | (d[i] & ~this_g);
3974     }
3975 }
3976 
3977 /* Likewise, but also compute flags.  */
3978 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3979                                intptr_t oprsz, bool after)
3980 {
3981     uint32_t flags = PREDTEST_INIT;
3982     bool brk = false;
3983     intptr_t i;
3984 
3985     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3986         uint64_t this_b, this_d = d[i], this_g = g[i];
3987 
3988         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3989         d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3990         flags = iter_predtest_fwd(this_d, this_g, flags);
3991     }
3992     return flags;
3993 }
3994 
3995 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3996 {
3997     /* It is quicker to zero the whole predicate than loop on OPRSZ.
3998      * The compiler should turn this into 4 64-bit integer stores.
3999      */
4000     memset(d, 0, sizeof(ARMPredicateReg));
4001     return PREDTEST_INIT;
4002 }
4003 
4004 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
4005                        uint32_t pred_desc)
4006 {
4007     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4008     if (last_active_pred(vn, vg, oprsz)) {
4009         compute_brk_z(vd, vm, vg, oprsz, true);
4010     } else {
4011         do_zero(vd, oprsz);
4012     }
4013 }
4014 
4015 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
4016                             uint32_t pred_desc)
4017 {
4018     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4019     if (last_active_pred(vn, vg, oprsz)) {
4020         return compute_brks_z(vd, vm, vg, oprsz, true);
4021     } else {
4022         return do_zero(vd, oprsz);
4023     }
4024 }
4025 
4026 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
4027                        uint32_t pred_desc)
4028 {
4029     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4030     if (last_active_pred(vn, vg, oprsz)) {
4031         compute_brk_z(vd, vm, vg, oprsz, false);
4032     } else {
4033         do_zero(vd, oprsz);
4034     }
4035 }
4036 
4037 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4038                             uint32_t pred_desc)
4039 {
4040     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4041     if (last_active_pred(vn, vg, oprsz)) {
4042         return compute_brks_z(vd, vm, vg, oprsz, false);
4043     } else {
4044         return do_zero(vd, oprsz);
4045     }
4046 }
4047 
4048 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4049 {
4050     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4051     compute_brk_z(vd, vn, vg, oprsz, true);
4052 }
4053 
4054 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4055 {
4056     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4057     return compute_brks_z(vd, vn, vg, oprsz, true);
4058 }
4059 
4060 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4061 {
4062     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4063     compute_brk_z(vd, vn, vg, oprsz, false);
4064 }
4065 
4066 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4067 {
4068     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4069     return compute_brks_z(vd, vn, vg, oprsz, false);
4070 }
4071 
4072 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4073 {
4074     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4075     compute_brk_m(vd, vn, vg, oprsz, true);
4076 }
4077 
4078 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4079 {
4080     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4081     return compute_brks_m(vd, vn, vg, oprsz, true);
4082 }
4083 
4084 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4085 {
4086     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4087     compute_brk_m(vd, vn, vg, oprsz, false);
4088 }
4089 
4090 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4091 {
4092     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4093     return compute_brks_m(vd, vn, vg, oprsz, false);
4094 }
4095 
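/*
 * BRKN: the destination is also the PM operand of the instruction, so
 * when the last active element of PN is true there is nothing to do
 * here (VD is expected to hold PM already); only the all-false case
 * needs to be written.
 */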
4096 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4097 {
4098     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4099     if (!last_active_pred(vn, vg, oprsz)) {
4100         do_zero(vd, oprsz);
4101     }
4102 }
4103 
4104 /* As if PredTest(Ones(PL), D, esz).  */
4105 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4106                               uint64_t esz_mask)
4107 {
4108     uint32_t flags = PREDTEST_INIT;
4109     intptr_t i;
4110 
4111     for (i = 0; i < oprsz / 8; i++) {
4112         flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4113     }
4114     if (oprsz & 7) {
4115         uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4116         flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4117     }
4118     return flags;
4119 }
4120 
4121 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4122 {
4123     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4124     if (last_active_pred(vn, vg, oprsz)) {
4125         return predtest_ones(vd, oprsz, -1);
4126     } else {
4127         return do_zero(vd, oprsz);
4128     }
4129 }
4130 
4131 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4132 {
4133     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4134     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4135     uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4136     intptr_t i;
4137 
4138     for (i = 0; i < words; ++i) {
4139         uint64_t t = n[i] & g[i] & mask;
4140         sum += ctpop64(t);
4141     }
4142     return sum;
4143 }
4144 
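/*
 * WHILE* helpers.  sve_whilel fills the predicate from element 0
 * upward, sve_whileg from the top of the vector downward.  COUNT is
 * already scaled from elements to predicate bits (presumably
 * element_count << esz in the translator): with 32-bit elements, where
 * pred_esz_masks[2] is 0x1111111111111111ull, a count of 12 bits sets
 * predicate bits 0, 4 and 8, i.e. the first three elements.
 */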
4145 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4146 {
4147     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4148     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4149     uint64_t esz_mask = pred_esz_masks[esz];
4150     ARMPredicateReg *d = vd;
4151     uint32_t flags;
4152     intptr_t i;
4153 
4154     /* Begin with a zero predicate register.  */
4155     flags = do_zero(d, oprsz);
4156     if (count == 0) {
4157         return flags;
4158     }
4159 
4160     /* Set all of the requested bits.  */
4161     for (i = 0; i < count / 64; ++i) {
4162         d->p[i] = esz_mask;
4163     }
4164     if (count & 63) {
4165         d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4166     }
4167 
4168     return predtest_ones(d, oprsz, esz_mask);
4169 }
4170 
4171 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4172 {
4173     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4174     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4175     uint64_t esz_mask = pred_esz_masks[esz];
4176     ARMPredicateReg *d = vd;
4177     intptr_t i, invcount, oprbits;
4178     uint64_t bits;
4179 
4180     if (count == 0) {
4181         return do_zero(d, oprsz);
4182     }
4183 
4184     oprbits = oprsz * 8;
4185     tcg_debug_assert(count <= oprbits);
4186 
4187     bits = esz_mask;
4188     if (oprbits & 63) {
4189         bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4190     }
4191 
4192     invcount = oprbits - count;
4193     for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4194         d->p[i] = bits;
4195         bits = esz_mask;
4196     }
4197 
4198     d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4199 
4200     while (--i >= 0) {
4201         d->p[i] = 0;
4202     }
4203 
4204     return predtest_ones(d, oprsz, esz_mask);
4205 }
4206 
4207 /* Recursive reduction on a function;
4208  * cf. the ARM ARM function ReducePredicated.
4209  *
4210  * While it would be possible to write this without the DATA temporary,
4211  * it is much simpler to process the predicate register this way.
4212  * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4213  * little to gain with a more complex non-recursive form.
4214  */
4215 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
4216 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4217 {                                                                     \
4218     if (n == 1) {                                                     \
4219         return *data;                                                 \
4220     } else {                                                          \
4221         uintptr_t half = n / 2;                                       \
4222         TYPE lo = NAME##_reduce(data, status, half);                  \
4223         TYPE hi = NAME##_reduce(data + half, status, half);           \
4224         return FUNC(lo, hi, status);                                  \
4225     }                                                                 \
4226 }                                                                     \
4227 uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \
4228 {                                                                     \
4229     uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
4230     TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
4231     for (i = 0; i < oprsz; ) {                                        \
4232         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
4233         do {                                                          \
4234             TYPE nn = *(TYPE *)(vn + H(i));                           \
4235             *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
4236             i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
4237         } while (i & 15);                                             \
4238     }                                                                 \
4239     for (; i < maxsz; i += sizeof(TYPE)) {                            \
4240         *(TYPE *)((void *)data + i) = IDENT;                          \
4241     }                                                                 \
4242     return NAME##_reduce(data, s, maxsz / sizeof(TYPE));              \
4243 }
4244 
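/*
 * A sketch of the expansion: for sve_faddv_s with a 16-byte vector in
 * which only elements 0 and 2 are predicated, DATA becomes
 * { n[0], 0, n[2], 0 } and the recursion computes
 * (n[0] + 0) + (n[2] + 0).  With a larger MAXSZ (the power-of-two size
 * passed in simd_data, presumably pow2ceil of the vector length) the
 * tail is likewise padded with the identity.
 */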
4245 DO_REDUCE(sve_faddv_h, float16, H1_2, float16_add, float16_zero)
4246 DO_REDUCE(sve_faddv_s, float32, H1_4, float32_add, float32_zero)
4247 DO_REDUCE(sve_faddv_d, float64, H1_8, float64_add, float64_zero)
4248 
4249 /* Identity is floatN_default_nan, without the function call.  */
4250 DO_REDUCE(sve_fminnmv_h, float16, H1_2, float16_minnum, 0x7E00)
4251 DO_REDUCE(sve_fminnmv_s, float32, H1_4, float32_minnum, 0x7FC00000)
4252 DO_REDUCE(sve_fminnmv_d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL)
4253 
4254 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, float16_maxnum, 0x7E00)
4255 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, float32_maxnum, 0x7FC00000)
4256 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL)
4257 
4258 DO_REDUCE(sve_fminv_h, float16, H1_2, float16_min, float16_infinity)
4259 DO_REDUCE(sve_fminv_s, float32, H1_4, float32_min, float32_infinity)
4260 DO_REDUCE(sve_fminv_d, float64, H1_8, float64_min, float64_infinity)
4261 
4262 DO_REDUCE(sve_fmaxv_h, float16, H1_2, float16_max, float16_chs(float16_infinity))
4263 DO_REDUCE(sve_fmaxv_s, float32, H1_4, float32_max, float32_chs(float32_infinity))
4264 DO_REDUCE(sve_fmaxv_d, float64, H1_8, float64_max, float64_chs(float64_infinity))
4265 
4266 DO_REDUCE(sve_ah_fminv_h, float16, H1_2, helper_vfp_ah_minh, float16_infinity)
4267 DO_REDUCE(sve_ah_fminv_s, float32, H1_4, helper_vfp_ah_mins, float32_infinity)
4268 DO_REDUCE(sve_ah_fminv_d, float64, H1_8, helper_vfp_ah_mind, float64_infinity)
4269 
4270 DO_REDUCE(sve_ah_fmaxv_h, float16, H1_2, helper_vfp_ah_maxh,
4271           float16_chs(float16_infinity))
4272 DO_REDUCE(sve_ah_fmaxv_s, float32, H1_4, helper_vfp_ah_maxs,
4273           float32_chs(float32_infinity))
4274 DO_REDUCE(sve_ah_fmaxv_d, float64, H1_8, helper_vfp_ah_maxd,
4275           float64_chs(float64_infinity))
4276 
4277 #undef DO_REDUCE
4278 
4279 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4280                              float_status *status, uint32_t desc)
4281 {
4282     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4283     float16 result = nn;
4284 
4285     do {
4286         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4287         do {
4288             if (pg & 1) {
4289                 float16 mm = *(float16 *)(vm + H1_2(i));
4290                 result = float16_add(result, mm, status);
4291             }
4292             i += sizeof(float16), pg >>= sizeof(float16);
4293         } while (i & 15);
4294     } while (i < opr_sz);
4295 
4296     return result;
4297 }
4298 
4299 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4300                              float_status *status, uint32_t desc)
4301 {
4302     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4303     float32 result = nn;
4304 
4305     do {
4306         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4307         do {
4308             if (pg & 1) {
4309                 float32 mm = *(float32 *)(vm + H1_2(i));
4310                 result = float32_add(result, mm, status);
4311             }
4312             i += sizeof(float32), pg >>= sizeof(float32);
4313         } while (i & 15);
4314     } while (i < opr_sz);
4315 
4316     return result;
4317 }
4318 
4319 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4320                              float_status *status, uint32_t desc)
4321 {
4322     intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4323     uint64_t *m = vm;
4324     uint8_t *pg = vg;
4325 
4326     for (i = 0; i < opr_sz; i++) {
4327         if (pg[H1(i)] & 1) {
4328             nn = float64_add(nn, m[i], status);
4329         }
4330     }
4331 
4332     return nn;
4333 }
4334 
4335 /* Fully general three-operand expander, controlled by a predicate,
4336  * with the extra float_status parameter.
4337  */
4338 #define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
4339 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
4340                   float_status *status, uint32_t desc)          \
4341 {                                                               \
4342     intptr_t i = simd_oprsz(desc);                              \
4343     uint64_t *g = vg;                                           \
4344     do {                                                        \
4345         uint64_t pg = g[(i - 1) >> 6];                          \
4346         do {                                                    \
4347             i -= sizeof(TYPE);                                  \
4348             if (likely((pg >> (i & 63)) & 1)) {                 \
4349                 TYPE nn = *(TYPE *)(vn + H(i));                 \
4350                 TYPE mm = *(TYPE *)(vm + H(i));                 \
4351                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
4352             }                                                   \
4353         } while (i & 63);                                       \
4354     } while (i != 0);                                           \
4355 }
4356 
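/*
 * In these predicated expanders the predicate bit for an element is
 * the bit at the element's byte offset (one predicate bit per byte,
 * only the lowest bit of each element significant), hence the test
 * (pg >> (i & 63)) & 1 against the word g[(i - 1) >> 6] while working
 * backwards through the vector 64 bytes at a time.
 */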
4357 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4358 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4359 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4360 
4361 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4362 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4363 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4364 
4365 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4366 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4367 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4368 
4369 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4370 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4371 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4372 
4373 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4374 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4375 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4376 
4377 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4378 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4379 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4380 
4381 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh)
4382 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins)
4383 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind)
4384 
4385 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh)
4386 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs)
4387 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd)
4388 
4389 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4390 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4391 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4392 
4393 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4394 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4395 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4396 
4397 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4398 {
4399     return float16_abs(float16_sub(a, b, s));
4400 }
4401 
4402 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4403 {
4404     return float32_abs(float32_sub(a, b, s));
4405 }
4406 
4407 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4408 {
4409     return float64_abs(float64_sub(a, b, s));
4410 }
4411 
4412 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
4413 static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat)
4414 {
4415     float16 r = float16_sub(op1, op2, stat);
4416     return float16_is_any_nan(r) ? r : float16_abs(r);
4417 }
4418 
4419 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat)
4420 {
4421     float32 r = float32_sub(op1, op2, stat);
4422     return float32_is_any_nan(r) ? r : float32_abs(r);
4423 }
4424 
4425 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat)
4426 {
4427     float64 r = float64_sub(op1, op2, stat);
4428     return float64_is_any_nan(r) ? r : float64_abs(r);
4429 }
4430 
4431 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4432 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4433 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4434 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h)
4435 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s)
4436 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d)
4437 
4438 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4439 {
4440     int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4441     return float64_scalbn(a, b_int, s);
4442 }
4443 
4444 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4445 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4446 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4447 
4448 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4449 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4450 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4451 
4452 #undef DO_ZPZZ_FP
4453 
4454 /* Three-operand expander, with one scalar operand, controlled by
4455  * a predicate, with the extra float_status parameter.
4456  */
4457 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4458 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
4459                   float_status *status, uint32_t desc)            \
4460 {                                                                 \
4461     intptr_t i = simd_oprsz(desc);                                \
4462     uint64_t *g = vg;                                             \
4463     TYPE mm = scalar;                                             \
4464     do {                                                          \
4465         uint64_t pg = g[(i - 1) >> 6];                            \
4466         do {                                                      \
4467             i -= sizeof(TYPE);                                    \
4468             if (likely((pg >> (i & 63)) & 1)) {                   \
4469                 TYPE nn = *(TYPE *)(vn + H(i));                   \
4470                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
4471             }                                                     \
4472         } while (i & 63);                                         \
4473     } while (i != 0);                                             \
4474 }
4475 
4476 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4477 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4478 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4479 
4480 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4481 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4482 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4483 
4484 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4485 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4486 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4487 
4488 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4489 {
4490     return float16_sub(b, a, s);
4491 }
4492 
4493 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4494 {
4495     return float32_sub(b, a, s);
4496 }
4497 
4498 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4499 {
4500     return float64_sub(b, a, s);
4501 }
4502 
4503 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4504 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4505 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4506 
4507 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4508 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4509 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4510 
4511 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4512 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4513 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4514 
4515 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4516 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4517 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4518 
4519 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4520 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4521 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4522 
4523 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh)
4524 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs)
4525 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd)
4526 
4527 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh)
4528 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins)
4529 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind)
4530 
4531 /* Fully general two-operand expander, controlled by a predicate,
4532  * with the extra float_status parameter.
4533  */
4534 #define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
4535 void HELPER(NAME)(void *vd, void *vn, void *vg,                       \
4536                   float_status *status, uint32_t desc)                \
4537 {                                                                     \
4538     intptr_t i = simd_oprsz(desc);                                    \
4539     uint64_t *g = vg;                                                 \
4540     do {                                                              \
4541         uint64_t pg = g[(i - 1) >> 6];                                \
4542         do {                                                          \
4543             i -= sizeof(TYPE);                                        \
4544             if (likely((pg >> (i & 63)) & 1)) {                       \
4545                 TYPE nn = *(TYPE *)(vn + H(i));                       \
4546                 *(TYPE *)(vd + H(i)) = OP(nn, status);                \
4547             }                                                         \
4548         } while (i & 63);                                             \
4549     } while (i != 0);                                                 \
4550 }
4551 
4552 /* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
4553  * FZ16.  When converting from fp16, this affects flushing input denormals;
4554  * when converting to fp16, this affects flushing output denormals.
4555  */
4556 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4557 {
4558     bool save = get_flush_inputs_to_zero(fpst);
4559     float32 ret;
4560 
4561     set_flush_inputs_to_zero(false, fpst);
4562     ret = float16_to_float32(f, true, fpst);
4563     set_flush_inputs_to_zero(save, fpst);
4564     return ret;
4565 }
4566 
4567 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4568 {
4569     bool save = get_flush_inputs_to_zero(fpst);
4570     float64 ret;
4571 
4572     set_flush_inputs_to_zero(false, fpst);
4573     ret = float16_to_float64(f, true, fpst);
4574     set_flush_inputs_to_zero(save, fpst);
4575     return ret;
4576 }
4577 
4578 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4579 {
4580     bool save = get_flush_to_zero(fpst);
4581     float16 ret;
4582 
4583     set_flush_to_zero(false, fpst);
4584     ret = float32_to_float16(f, true, fpst);
4585     set_flush_to_zero(save, fpst);
4586     return ret;
4587 }
4588 
4589 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4590 {
4591     bool save = get_flush_to_zero(fpst);
4592     float16 ret;
4593 
4594     set_flush_to_zero(false, fpst);
4595     ret = float64_to_float16(f, true, fpst);
4596     set_flush_to_zero(save, fpst);
4597     return ret;
4598 }
4599 
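/*
 * Round-to-zero conversions that return 0 for a NaN input, as the Arm
 * FPToFixed pseudocode requires, rather than whatever saturated value
 * the generic softfloat conversions would otherwise return; Invalid is
 * still raised.
 */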
4600 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4601 {
4602     if (float16_is_any_nan(f)) {
4603         float_raise(float_flag_invalid, s);
4604         return 0;
4605     }
4606     return float16_to_int16_round_to_zero(f, s);
4607 }
4608 
4609 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4610 {
4611     if (float16_is_any_nan(f)) {
4612         float_raise(float_flag_invalid, s);
4613         return 0;
4614     }
4615     return float16_to_int64_round_to_zero(f, s);
4616 }
4617 
4618 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4619 {
4620     if (float32_is_any_nan(f)) {
4621         float_raise(float_flag_invalid, s);
4622         return 0;
4623     }
4624     return float32_to_int64_round_to_zero(f, s);
4625 }
4626 
4627 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4628 {
4629     if (float64_is_any_nan(f)) {
4630         float_raise(float_flag_invalid, s);
4631         return 0;
4632     }
4633     return float64_to_int64_round_to_zero(f, s);
4634 }
4635 
4636 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4637 {
4638     if (float16_is_any_nan(f)) {
4639         float_raise(float_flag_invalid, s);
4640         return 0;
4641     }
4642     return float16_to_uint16_round_to_zero(f, s);
4643 }
4644 
4645 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4646 {
4647     if (float16_is_any_nan(f)) {
4648         float_raise(float_flag_invalid, s);
4649         return 0;
4650     }
4651     return float16_to_uint64_round_to_zero(f, s);
4652 }
4653 
4654 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4655 {
4656     if (float32_is_any_nan(f)) {
4657         float_raise(float_flag_invalid, s);
4658         return 0;
4659     }
4660     return float32_to_uint64_round_to_zero(f, s);
4661 }
4662 
4663 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4664 {
4665     if (float64_is_any_nan(f)) {
4666         float_raise(float_flag_invalid, s);
4667         return 0;
4668     }
4669     return float64_to_uint64_round_to_zero(f, s);
4670 }
4671 
4672 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4673 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4674 DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
4675 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4676 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4677 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4678 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4679 
4680 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4681 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4682 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4683 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4684 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4685 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4686 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4687 
4688 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4689 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4690 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4691 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4692 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4693 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4694 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4695 
4696 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4697 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4698 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4699 
4700 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4701 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4702 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4703 
4704 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4705 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4706 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4707 
4708 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4709 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4710 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4711 
4712 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4713 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4714 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4715 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4716 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4717 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4718 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4719 
4720 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4721 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4722 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4723 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4724 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4725 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4726 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4727 
4728 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4729 {
4730     /* Extract frac to the top of the uint32_t. */
4731     uint32_t frac = (uint32_t)a << (16 + 6);
4732     int16_t exp = extract32(a, 10, 5);
4733 
4734     if (unlikely(exp == 0)) {
4735         if (frac != 0) {
4736             if (!get_flush_inputs_to_zero(s)) {
4737                 /* denormal: bias - fractional_zeros */
4738                 return -15 - clz32(frac);
4739             }
4740             /* flush to zero */
4741             float_raise(float_flag_input_denormal_flushed, s);
4742         }
4743     } else if (unlikely(exp == 0x1f)) {
4744         if (frac == 0) {
4745             return INT16_MAX; /* infinity */
4746         }
4747     } else {
4748         /* normal: exp - bias */
4749         return exp - 15;
4750     }
4751     /* nan or zero */
4752     float_raise(float_flag_invalid, s);
4753     return INT16_MIN;
4754 }
4755 
4756 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4757 {
4758     /* Extract frac to the top of the uint32_t. */
4759     uint32_t frac = a << 9;
4760     int32_t exp = extract32(a, 23, 8);
4761 
4762     if (unlikely(exp == 0)) {
4763         if (frac != 0) {
4764             if (!get_flush_inputs_to_zero(s)) {
4765                 /* denormal: bias - fractional_zeros */
4766                 return -127 - clz32(frac);
4767             }
4768             /* flush to zero */
4769             float_raise(float_flag_input_denormal_flushed, s);
4770         }
4771     } else if (unlikely(exp == 0xff)) {
4772         if (frac == 0) {
4773             return INT32_MAX; /* infinity */
4774         }
4775     } else {
4776         /* normal: exp - bias */
4777         return exp - 127;
4778     }
4779     /* nan or zero */
4780     float_raise(float_flag_invalid, s);
4781     return INT32_MIN;
4782 }
4783 
4784 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4785 {
4786     /* Extract frac to the top of the uint64_t. */
4787     uint64_t frac = a << 12;
4788     int64_t exp = extract64(a, 52, 11);
4789 
4790     if (unlikely(exp == 0)) {
4791         if (frac != 0) {
4792             if (!get_flush_inputs_to_zero(s)) {
4793                 /* denormal: bias - fractional_zeros */
4794                 return -1023 - clz64(frac);
4795             }
4796             /* flush to zero */
4797             float_raise(float_flag_input_denormal_flushed, s);
4798         }
4799     } else if (unlikely(exp == 0x7ff)) {
4800         if (frac == 0) {
4801             return INT64_MAX; /* infinity */
4802         }
4803     } else {
4804         /* normal: exp - bias */
4805         return exp - 1023;
4806     }
4807     /* nan or zero */
4808     float_raise(float_flag_invalid, s);
4809     return INT64_MIN;
4810 }
4811 
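/*
 * Some half-precision examples of the above: 8.0 (biased exponent 18)
 * yields 18 - 15 = 3 and 0.5 (biased exponent 14) yields -1; infinity
 * yields INT16_MAX, while zero and NaN yield INT16_MIN and raise
 * Invalid.  A denormal either uses the leading-zero count of its
 * fraction or is flushed, depending on the flush-to-zero setting.
 */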
4812 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4813 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4814 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4815 
4816 #undef DO_ZPZ_FP
4817 
4818 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4819                             float_status *status, uint32_t desc,
4820                             uint16_t neg1, uint16_t neg3, int flags)
4821 {
4822     intptr_t i = simd_oprsz(desc);
4823     uint64_t *g = vg;
4824 
4825     do {
4826         uint64_t pg = g[(i - 1) >> 6];
4827         do {
4828             i -= 2;
4829             if (likely((pg >> (i & 63)) & 1)) {
4830                 float16 e1, e2, e3, r;
4831 
4832                 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4833                 e2 = *(uint16_t *)(vm + H1_2(i));
4834                 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4835                 r = float16_muladd(e1, e2, e3, flags, status);
4836                 *(uint16_t *)(vd + H1_2(i)) = r;
4837             }
4838         } while (i & 63);
4839     } while (i != 0);
4840 }
4841 
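/*
 * The callers below select the operation via NEG1/NEG3/FLAGS: FMLA
 * passes (0, 0, 0), FMLS flips the sign of the first operand, FNMLA
 * flips the first operand and the addend, FNMLS flips only the addend.
 * The FPCR.AH variants instead pass float_muladd_negate_* flags, so
 * that negation is folded into the fused operation rather than applied
 * by flipping the sign bit of a possibly-NaN input.
 */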
4842 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4843                               void *vg, float_status *status, uint32_t desc)
4844 {
4845     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4846 }
4847 
4848 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4849                               void *vg, float_status *status, uint32_t desc)
4850 {
4851     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0);
4852 }
4853 
4854 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4855                                void *vg, float_status *status, uint32_t desc)
4856 {
4857     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0);
4858 }
4859 
4860 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4861                                void *vg, float_status *status, uint32_t desc)
4862 {
4863     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0);
4864 }
4865 
4866 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4867                               void *vg, float_status *status, uint32_t desc)
4868 {
4869     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4870                     float_muladd_negate_product);
4871 }
4872 
4873 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4874                                void *vg, float_status *status, uint32_t desc)
4875 {
4876     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4877                     float_muladd_negate_product | float_muladd_negate_c);
4878 }
4879 
4880 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4881                                void *vg, float_status *status, uint32_t desc)
4882 {
4883     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4884                     float_muladd_negate_c);
4885 }
4886 
4887 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4888                             float_status *status, uint32_t desc,
4889                             uint32_t neg1, uint32_t neg3, int flags)
4890 {
4891     intptr_t i = simd_oprsz(desc);
4892     uint64_t *g = vg;
4893 
4894     do {
4895         uint64_t pg = g[(i - 1) >> 6];
4896         do {
4897             i -= 4;
4898             if (likely((pg >> (i & 63)) & 1)) {
4899                 float32 e1, e2, e3, r;
4900 
4901                 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4902                 e2 = *(uint32_t *)(vm + H1_4(i));
4903                 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4904                 r = float32_muladd(e1, e2, e3, flags, status);
4905                 *(uint32_t *)(vd + H1_4(i)) = r;
4906             }
4907         } while (i & 63);
4908     } while (i != 0);
4909 }
4910 
4911 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4912                               void *vg, float_status *status, uint32_t desc)
4913 {
4914     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4915 }
4916 
4917 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4918                               void *vg, float_status *status, uint32_t desc)
4919 {
4920     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0);
4921 }
4922 
4923 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4924                                void *vg, float_status *status, uint32_t desc)
4925 {
4926     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0);
4927 }
4928 
4929 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4930                                void *vg, float_status *status, uint32_t desc)
4931 {
4932     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0);
4933 }
4934 
4935 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4936                               void *vg, float_status *status, uint32_t desc)
4937 {
4938     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4939                     float_muladd_negate_product);
4940 }
4941 
4942 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4943                                void *vg, float_status *status, uint32_t desc)
4944 {
4945     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4946                     float_muladd_negate_product | float_muladd_negate_c);
4947 }
4948 
4949 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4950                                void *vg, float_status *status, uint32_t desc)
4951 {
4952     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4953                     float_muladd_negate_c);
4954 }
4955 
4956 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4957                             float_status *status, uint32_t desc,
4958                             uint64_t neg1, uint64_t neg3, int flags)
4959 {
4960     intptr_t i = simd_oprsz(desc);
4961     uint64_t *g = vg;
4962 
4963     do {
4964         uint64_t pg = g[(i - 1) >> 6];
4965         do {
4966             i -= 8;
4967             if (likely((pg >> (i & 63)) & 1)) {
4968                 float64 e1, e2, e3, r;
4969 
4970                 e1 = *(uint64_t *)(vn + i) ^ neg1;
4971                 e2 = *(uint64_t *)(vm + i);
4972                 e3 = *(uint64_t *)(va + i) ^ neg3;
4973                 r = float64_muladd(e1, e2, e3, flags, status);
4974                 *(uint64_t *)(vd + i) = r;
4975             }
4976         } while (i & 63);
4977     } while (i != 0);
4978 }
4979 
4980 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4981                               void *vg, float_status *status, uint32_t desc)
4982 {
4983     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4984 }
4985 
4986 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4987                               void *vg, float_status *status, uint32_t desc)
4988 {
4989     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0);
4990 }
4991 
4992 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4993                                void *vg, float_status *status, uint32_t desc)
4994 {
4995     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0);
4996 }
4997 
4998 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4999                                void *vg, float_status *status, uint32_t desc)
5000 {
5001     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0);
5002 }
5003 
5004 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5005                               void *vg, float_status *status, uint32_t desc)
5006 {
5007     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5008                     float_muladd_negate_product);
5009 }
5010 
5011 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5012                                void *vg, float_status *status, uint32_t desc)
5013 {
5014     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5015                     float_muladd_negate_product | float_muladd_negate_c);
5016 }
5017 
5018 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5019                                void *vg, float_status *status, uint32_t desc)
5020 {
5021     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5022                     float_muladd_negate_c);
5023 }
5024 
5025 /* Two-operand floating-point comparison controlled by a predicate.
5026  * Unlike the integer version, we are not allowed to optimistically
5027  * compare operands, since the comparison may have side effects wrt
5028  * the FPSR.
5029  */
5030 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
5031 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
5032                   float_status *status, uint32_t desc)                  \
5033 {                                                                       \
5034     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
5035     uint64_t *d = vd, *g = vg;                                          \
5036     do {                                                                \
5037         uint64_t out = 0, pg = g[j];                                    \
5038         do {                                                            \
5039             i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
5040             if (likely((pg >> (i & 63)) & 1)) {                         \
5041                 TYPE nn = *(TYPE *)(vn + H(i));                         \
5042                 TYPE mm = *(TYPE *)(vm + H(i));                         \
5043                 out |= OP(TYPE, nn, mm, status);                        \
5044             }                                                           \
5045         } while (i & 63);                                               \
5046         d[j--] = out;                                                   \
5047     } while (i > 0);                                                    \
5048 }
5049 
5050 #define DO_FPCMP_PPZZ_H(NAME, OP) \
5051     DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
5052 #define DO_FPCMP_PPZZ_S(NAME, OP) \
5053     DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
5054 #define DO_FPCMP_PPZZ_D(NAME, OP) \
5055     DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
5056 
5057 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
5058     DO_FPCMP_PPZZ_H(NAME, OP)   \
5059     DO_FPCMP_PPZZ_S(NAME, OP)   \
5060     DO_FPCMP_PPZZ_D(NAME, OP)
5061 
5062 #define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
5063 #define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
5064 #define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
5065 #define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
5066 #define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
5067 #define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
5068 #define DO_FCMUO(TYPE, X, Y, ST)  \
5069     TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
5070 #define DO_FACGE(TYPE, X, Y, ST)  \
5071     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
5072 #define DO_FACGT(TYPE, X, Y, ST)  \
5073     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
5074 
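/*
 * Note the swapped operands above: DO_FCMGE(X, Y) evaluates
 * compare(Y, X) <= 0, which is X >= Y with any unordered result
 * (float_relation_unordered is positive) coming out false.  EQ, NE and
 * UO use the quiet compare and so raise Invalid only for signalling
 * NaNs; the ordered comparisons and the FAC* absolute-value forms use
 * the signalling compare.
 */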
5075 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
5076 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
5077 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
5078 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
5079 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
5080 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
5081 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
5082 
5083 #undef DO_FPCMP_PPZZ_ALL
5084 #undef DO_FPCMP_PPZZ_D
5085 #undef DO_FPCMP_PPZZ_S
5086 #undef DO_FPCMP_PPZZ_H
5087 #undef DO_FPCMP_PPZZ
5088 
5089 /* One-operand floating-point comparison against zero, controlled
5090  * by a predicate.
5091  */
5092 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
5093 void HELPER(NAME)(void *vd, void *vn, void *vg,            \
5094                   float_status *status, uint32_t desc)     \
5095 {                                                          \
5096     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
5097     uint64_t *d = vd, *g = vg;                             \
5098     do {                                                   \
5099         uint64_t out = 0, pg = g[j];                       \
5100         do {                                               \
5101             i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
5102             if ((pg >> (i & 63)) & 1) {                    \
5103                 TYPE nn = *(TYPE *)(vn + H(i));            \
5104                 out |= OP(TYPE, nn, 0, status);            \
5105             }                                              \
5106         } while (i & 63);                                  \
5107         d[j--] = out;                                      \
5108     } while (i > 0);                                       \
5109 }
5110 
5111 #define DO_FPCMP_PPZ0_H(NAME, OP) \
5112     DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
5113 #define DO_FPCMP_PPZ0_S(NAME, OP) \
5114     DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
5115 #define DO_FPCMP_PPZ0_D(NAME, OP) \
5116     DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
5117 
5118 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
5119     DO_FPCMP_PPZ0_H(NAME, OP)   \
5120     DO_FPCMP_PPZ0_S(NAME, OP)   \
5121     DO_FPCMP_PPZ0_D(NAME, OP)
5122 
5123 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
5124 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
5125 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
5126 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
5127 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
5128 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
5129 
5130 /* FP Trig Multiply-Add. */
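/*
 * The COEFF tables below hold the FTMAD coefficients from the Arm ARM
 * pseudocode; the low eight entries are (it appears) the sine-series
 * terms and the high eight the cosine-series terms.  A negative
 * multiplicand selects the upper half of the table; with FPCR.AH clear
 * the multiplicand is then made positive with float*_abs, with FPCR.AH
 * set the sign is instead handled via float_muladd_negate_product.
 */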
5131 
5132 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm,
5133                          float_status *s, uint32_t desc)
5134 {
5135     static const float16 coeff[16] = {
5136         0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5137         0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5138     };
5139     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
5140     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5141     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5142     float16 *d = vd, *n = vn, *m = vm;
5143 
5144     for (i = 0; i < opr_sz; i++) {
5145         float16 mm = m[i];
5146         intptr_t xx = x;
5147         int flags = 0;
5148 
5149         if (float16_is_neg(mm)) {
5150             if (fpcr_ah) {
5151                 flags = float_muladd_negate_product;
5152             } else {
5153                 mm = float16_abs(mm);
5154             }
5155             xx += 8;
5156         }
5157         d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s);
5158     }
5159 }
5160 
5161 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm,
5162                          float_status *s, uint32_t desc)
5163 {
5164     static const float32 coeff[16] = {
5165         0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5166         0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5167         0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5168         0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5169     };
5170     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5171     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5172     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5173     float32 *d = vd, *n = vn, *m = vm;
5174 
5175     for (i = 0; i < opr_sz; i++) {
5176         float32 mm = m[i];
5177         intptr_t xx = x;
5178         int flags = 0;
5179 
5180         if (float32_is_neg(mm)) {
5181             if (fpcr_ah) {
5182                 flags = float_muladd_negate_product;
5183             } else {
5184                 mm = float32_abs(mm);
5185             }
5186             xx += 8;
5187         }
5188         d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s);
5189     }
5190 }
5191 
5192 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm,
5193                          float_status *s, uint32_t desc)
5194 {
5195     static const float64 coeff[16] = {
5196         0x3ff0000000000000ull, 0xbfc5555555555543ull,
5197         0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5198         0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5199         0x3de5d8408868552full, 0x0000000000000000ull,
5200         0x3ff0000000000000ull, 0xbfe0000000000000ull,
5201         0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5202         0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5203         0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5204     };
5205     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5206     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5207     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5208     float64 *d = vd, *n = vn, *m = vm;
5209 
5210     for (i = 0; i < opr_sz; i++) {
5211         float64 mm = m[i];
5212         intptr_t xx = x;
5213         int flags = 0;
5214 
5215         if (float64_is_neg(mm)) {
5216             if (fpcr_ah) {
5217                 flags = float_muladd_negate_product;
5218             } else {
5219                 mm = float64_abs(mm);
5220             }
5221             xx += 8;
5222         }
5223         d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s);
5224     }
5225 }
5226 
5227 /*
5228  * FP Complex Add
5229  */
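/*
 * Adjacent element pairs hold (real, imaginary).  The ROT bit selects
 * which element of the second operand is negated before the add,
 * i.e. a rotation of that operand by 90 or 270 degrees;
 * float*_maybe_ah_chs leaves the sign of a NaN untouched when FPCR.AH
 * is set.
 */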
5230 
5231 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5232                          float_status *s, uint32_t desc)
5233 {
5234     intptr_t j, i = simd_oprsz(desc);
5235     uint64_t *g = vg;
5236     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5237     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5238 
5239     do {
5240         uint64_t pg = g[(i - 1) >> 6];
5241         do {
5242             float16 e0, e1, e2, e3;
5243 
5244             /* I holds the real index; J holds the imag index.  */
5245             j = i - sizeof(float16);
5246             i -= 2 * sizeof(float16);
5247 
5248             e0 = *(float16 *)(vn + H1_2(i));
5249             e1 = *(float16 *)(vm + H1_2(j));
5250             e2 = *(float16 *)(vn + H1_2(j));
5251             e3 = *(float16 *)(vm + H1_2(i));
5252 
5253             if (rot) {
5254                 e3 = float16_maybe_ah_chs(e3, fpcr_ah);
5255             } else {
5256                 e1 = float16_maybe_ah_chs(e1, fpcr_ah);
5257             }
5258 
5259             if (likely((pg >> (i & 63)) & 1)) {
5260                 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s);
5261             }
5262             if (likely((pg >> (j & 63)) & 1)) {
5263                 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s);
5264             }
5265         } while (i & 63);
5266     } while (i != 0);
5267 }
5268 
5269 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5270                          float_status *s, uint32_t desc)
5271 {
5272     intptr_t j, i = simd_oprsz(desc);
5273     uint64_t *g = vg;
5274     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5275     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5276 
5277     do {
5278         uint64_t pg = g[(i - 1) >> 6];
5279         do {
5280             float32 e0, e1, e2, e3;
5281 
5282             /* I holds the real index; J holds the imag index.  */
5283             j = i - sizeof(float32);
5284             i -= 2 * sizeof(float32);
5285 
5286             e0 = *(float32 *)(vn + H1_2(i));
5287             e1 = *(float32 *)(vm + H1_2(j));
5288             e2 = *(float32 *)(vn + H1_2(j));
5289             e3 = *(float32 *)(vm + H1_2(i));
5290 
5291             if (rot) {
5292                 e3 = float32_maybe_ah_chs(e3, fpcr_ah);
5293             } else {
5294                 e1 = float32_maybe_ah_chs(e1, fpcr_ah);
5295             }
5296 
5297             if (likely((pg >> (i & 63)) & 1)) {
5298                 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s);
5299             }
5300             if (likely((pg >> (j & 63)) & 1)) {
5301                 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s);
5302             }
5303         } while (i & 63);
5304     } while (i != 0);
5305 }
5306 
5307 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5308                          float_status *s, uint32_t desc)
5309 {
5310     intptr_t j, i = simd_oprsz(desc);
5311     uint64_t *g = vg;
5312     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5313     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5314 
5315     do {
5316         uint64_t pg = g[(i - 1) >> 6];
5317         do {
5318             float64 e0, e1, e2, e3;
5319 
5320             /* I holds the real index; J holds the imag index.  */
5321             j = i - sizeof(float64);
5322             i -= 2 * sizeof(float64);
5323 
5324             e0 = *(float64 *)(vn + H1_2(i));
5325             e1 = *(float64 *)(vm + H1_2(j));
5326             e2 = *(float64 *)(vn + H1_2(j));
5327             e3 = *(float64 *)(vm + H1_2(i));
5328 
5329             if (rot) {
5330                 e3 = float64_maybe_ah_chs(e3, fpcr_ah);
5331             } else {
5332                 e1 = float64_maybe_ah_chs(e1, fpcr_ah);
5333             }
5334 
5335             if (likely((pg >> (i & 63)) & 1)) {
5336                 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s);
5337             }
5338             if (likely((pg >> (j & 63)) & 1)) {
5339                 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s);
5340             }
5341         } while (i & 63);
5342     } while (i != 0);
5343 }
5344 
5345 /*
5346  * FP Complex Multiply
5347  */
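/*
 * Each pair of elements again holds (real, imaginary).  FLIP and the
 * NEGF_* bits come from the rotate encoding; with FPCR.AH clear the
 * negations are applied by XOR-ing the sign bit into an operand
 * (NEGX_*), with FPCR.AH set they are passed to float*_muladd as
 * negate-product flags (NEGF_*) so a NaN operand keeps its sign.
 */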
5348 
5349 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5350                                void *vg, float_status *status, uint32_t desc)
5351 {
5352     intptr_t j, i = simd_oprsz(desc);
5353     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5354     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5355     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5356     uint32_t negf_real = flip ^ negf_imag;
5357     float16 negx_imag, negx_real;
5358     uint64_t *g = vg;
5359 
5360     /* With AH=0, use negx; with AH=1, use negf. */
5361     negx_real = (negf_real & ~fpcr_ah) << 15;
5362     negx_imag = (negf_imag & ~fpcr_ah) << 15;
5363     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5364     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5365 
5366     do {
5367         uint64_t pg = g[(i - 1) >> 6];
5368         do {
5369             float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5370 
5371             /* I holds the real index; J holds the imag index.  */
5372             j = i - sizeof(float16);
5373             i -= 2 * sizeof(float16);
5374 
5375             nr = *(float16 *)(vn + H1_2(i));
5376             ni = *(float16 *)(vn + H1_2(j));
5377             mr = *(float16 *)(vm + H1_2(i));
5378             mi = *(float16 *)(vm + H1_2(j));
5379 
5380             e2 = (flip ? ni : nr);
5381             e1 = (flip ? mi : mr) ^ negx_real;
5382             e4 = e2;
5383             e3 = (flip ? mr : mi) ^ negx_imag;
5384 
5385             if (likely((pg >> (i & 63)) & 1)) {
5386                 d = *(float16 *)(va + H1_2(i));
5387                 d = float16_muladd(e2, e1, d, negf_real, status);
5388                 *(float16 *)(vd + H1_2(i)) = d;
5389             }
5390             if (likely((pg >> (j & 63)) & 1)) {
5391                 d = *(float16 *)(va + H1_2(j));
5392                 d = float16_muladd(e4, e3, d, negf_imag, status);
5393                 *(float16 *)(vd + H1_2(j)) = d;
5394             }
5395         } while (i & 63);
5396     } while (i != 0);
5397 }
5398 
5399 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5400                                void *vg, float_status *status, uint32_t desc)
5401 {
5402     intptr_t j, i = simd_oprsz(desc);
5403     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5404     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5405     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5406     uint32_t negf_real = flip ^ negf_imag;
5407     float32 negx_imag, negx_real;
5408     uint64_t *g = vg;
5409 
5410     /* With AH=0, use negx; with AH=1 use negf. */
5411     negx_real = (negf_real & ~fpcr_ah) << 31;
5412     negx_imag = (negf_imag & ~fpcr_ah) << 31;
5413     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5414     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5415 
5416     do {
5417         uint64_t pg = g[(i - 1) >> 6];
5418         do {
5419             float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5420 
5421             /* I holds the real index; J holds the imag index.  */
5422             j = i - sizeof(float32);
5423             i -= 2 * sizeof(float32);
5424 
5425             nr = *(float32 *)(vn + H1_4(i));
5426             ni = *(float32 *)(vn + H1_4(j));
5427             mr = *(float32 *)(vm + H1_4(i));
5428             mi = *(float32 *)(vm + H1_4(j));
5429 
5430             e2 = (flip ? ni : nr);
5431             e1 = (flip ? mi : mr) ^ negx_real;
5432             e4 = e2;
5433             e3 = (flip ? mr : mi) ^ negx_imag;
5434 
5435             if (likely((pg >> (i & 63)) & 1)) {
5436                 d = *(float32 *)(va + H1_4(i));
5437                 d = float32_muladd(e2, e1, d, negf_real, status);
5438                 *(float32 *)(vd + H1_4(i)) = d;
5439             }
5440             if (likely((pg >> (j & 63)) & 1)) {
5441                 d = *(float32 *)(va + H1_4(j));
5442                 d = float32_muladd(e4, e3, d, negf_imag, status);
5443                 *(float32 *)(vd + H1_4(j)) = d;
5444             }
5445         } while (i & 63);
5446     } while (i != 0);
5447 }
5448 
5449 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5450                                void *vg, float_status *status, uint32_t desc)
5451 {
5452     intptr_t j, i = simd_oprsz(desc);
5453     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5454     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5455     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5456     uint32_t negf_real = flip ^ negf_imag;
5457     float64 negx_imag, negx_real;
5458     uint64_t *g = vg;
5459 
5460     /* With AH=0, use negx; with AH=1 use negf. */
5461     negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
5462     negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
5463     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5464     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5465 
5466     do {
5467         uint64_t pg = g[(i - 1) >> 6];
5468         do {
5469             float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5470 
5471             /* I holds the real index; J holds the imag index.  */
5472             j = i - sizeof(float64);
5473             i -= 2 * sizeof(float64);
5474 
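            /* 64-bit elements require no host-endian H() adjustment. */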
5475             nr = *(float64 *)(vn + i);
5476             ni = *(float64 *)(vn + j);
5477             mr = *(float64 *)(vm + i);
5478             mi = *(float64 *)(vm + j);
5479 
5480             e2 = (flip ? ni : nr);
5481             e1 = (flip ? mi : mr) ^ negx_real;
5482             e4 = e2;
5483             e3 = (flip ? mr : mi) ^ negx_imag;
5484 
5485             if (likely((pg >> (i & 63)) & 1)) {
5486                 d = *(float64 *)(va + i);
5487                 d = float64_muladd(e2, e1, d, negf_real, status);
5488                 *(float64 *)(vd + i) = d;
5489             }
5490             if (likely((pg >> (j & 63)) & 1)) {
5491                 d = *(float64 *)(va + j);
5492                 d = float64_muladd(e4, e3, d, negf_imag, status);
5493                 *(float64 *)(vd + j) = d;
5494             }
5495         } while (i & 63);
5496     } while (i != 0);
5497 }
5498 
5499 /*
5500  * Load contiguous data, protected by a governing predicate.
5501  */
5502 
5503 /*
5504  * Skip through a sequence of inactive elements in the guarding predicate @vg,
5505  * beginning at @reg_off, bounded by @reg_max.  Return the offset of the first
5506  * active element >= @reg_off, or @reg_max if there are no active elements at all.
5507  */
5508 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5509                                  intptr_t reg_max, int esz)
5510 {
5511     uint64_t pg_mask = pred_esz_masks[esz];
5512     uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5513 
5514     /* In normal usage, the first element is active.  */
5515     if (likely(pg & 1)) {
5516         return reg_off;
5517     }
5518 
5519     if (pg == 0) {
5520         reg_off &= -64;
5521         do {
5522             reg_off += 64;
5523             if (unlikely(reg_off >= reg_max)) {
5524                 /* The entire predicate was false.  */
5525                 return reg_max;
5526             }
5527             pg = vg[reg_off >> 6] & pg_mask;
5528         } while (pg == 0);
5529     }
5530     reg_off += ctz64(pg);
5531 
5532     /* We should never see an out of range predicate bit set.  */
5533     tcg_debug_assert(reg_off < reg_max);
5534     return reg_off;
5535 }
5536 
5537 /*
5538  * Resolve the guest virtual address to info->host and info->flags.
5539  * If @nofault, return false if the page is invalid, otherwise
5540  * exit via page fault exception.
5541  */
5542 
5543 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5544                     target_ulong addr, int mem_off, MMUAccessType access_type,
5545                     int mmu_idx, uintptr_t retaddr)
5546 {
5547     int flags;
5548 
5549     addr += mem_off;
5550 
5551     /*
5552      * User-only currently always runs with TBI enabled.  See the comment
5553      * above useronly_clean_ptr.  Usually we clean this top byte away
5554      * during translation, but we can't do that for e.g. vector + imm
5555      * addressing modes.
5556      *
5557      * We currently always enable TBI for user-only, and do not provide
5558      * a way to turn it off.  So clean the pointer unconditionally here,
5559      * rather than look it up here, or pass it down from above.
5560      */
5561     addr = useronly_clean_ptr(addr);
5562 
5563 #ifdef CONFIG_USER_ONLY
5564     flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5565                                &info->host, retaddr);
5566 #else
5567     CPUTLBEntryFull *full;
5568     flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5569                               &info->host, &full, retaddr);
5570 #endif
5571     info->flags = flags;
5572 
5573     if (flags & TLB_INVALID_MASK) {
5574         g_assert(nofault);
5575         return false;
5576     }
5577 
5578 #ifdef CONFIG_USER_ONLY
5579     memset(&info->attrs, 0, sizeof(info->attrs));
5580     /* Require both ANON and MTE; see allocation_tag_mem(). */
5581     info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5582 #else
5583     info->attrs = full->attrs;
5584     info->tagged = full->extra.arm.pte_attrs == 0xf0;
5585 #endif
5586 
5587     /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5588     info->host -= mem_off;
5589     return true;
5590 }
5591 
5592 /*
5593  * Find first active element on each page, and a loose bound for the
5594  * final element on each page.  Identify any single element that spans
5595  * the page boundary.  Return true if there are any active elements.
5596  */
5597 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5598                             intptr_t reg_max, int esz, int msize)
5599 {
5600     const int esize = 1 << esz;
5601     const uint64_t pg_mask = pred_esz_masks[esz];
5602     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5603     intptr_t mem_off_last, mem_off_split;
5604     intptr_t page_split, elt_split;
5605     intptr_t i;
5606 
5607     /* Set all of the element indices to -1, and the TLB data to 0. */
5608     memset(info, -1, offsetof(SVEContLdSt, page));
5609     memset(info->page, 0, sizeof(info->page));
5610 
5611     /* Gross scan over the entire predicate to find bounds. */
5612     i = 0;
5613     do {
5614         uint64_t pg = vg[i] & pg_mask;
5615         if (pg) {
5616             reg_off_last = i * 64 + 63 - clz64(pg);
5617             if (reg_off_first < 0) {
5618                 reg_off_first = i * 64 + ctz64(pg);
5619             }
5620         }
5621     } while (++i * 64 < reg_max);
5622 
5623     if (unlikely(reg_off_first < 0)) {
5624         /* No active elements, no pages touched. */
5625         return false;
5626     }
5627     tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5628 
5629     info->reg_off_first[0] = reg_off_first;
5630     info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5631     mem_off_last = (reg_off_last >> esz) * msize;
5632 
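    /* Number of bytes from addr to the end of its page. */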
5633     page_split = -(addr | TARGET_PAGE_MASK);
5634     if (likely(mem_off_last + msize <= page_split)) {
5635         /* The entire operation fits within a single page. */
5636         info->reg_off_last[0] = reg_off_last;
5637         return true;
5638     }
5639 
5640     info->page_split = page_split;
5641     elt_split = page_split / msize;
5642     reg_off_split = elt_split << esz;
5643     mem_off_split = elt_split * msize;
5644 
5645     /*
5646      * This is the last full element on the first page, but it is not
5647      * necessarily active.  If there is no full element, i.e. the first
5648      * active element is the one that's split, this value remains -1.
5649      * It is useful as an iteration bound.
5650      */
5651     if (elt_split != 0) {
5652         info->reg_off_last[0] = reg_off_split - esize;
5653     }
5654 
5655     /* Determine if an unaligned element spans the pages.  */
5656     if (page_split % msize != 0) {
5657         /* It is helpful to know if the split element is active. */
5658         if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5659             info->reg_off_split = reg_off_split;
5660             info->mem_off_split = mem_off_split;
5661 
5662             if (reg_off_split == reg_off_last) {
5663                 /* The page crossing element is last. */
5664                 return true;
5665             }
5666         }
5667         reg_off_split += esize;
5668         mem_off_split += msize;
5669     }
5670 
5671     /*
5672      * We do want the first active element on the second page, because
5673      * this may affect the address reported in an exception.
5674      */
5675     reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5676     tcg_debug_assert(reg_off_split <= reg_off_last);
5677     info->reg_off_first[1] = reg_off_split;
5678     info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5679     info->reg_off_last[1] = reg_off_last;
5680     return true;
5681 }
5682 
5683 /*
5684  * Resolve the guest virtual addresses to info->page[].
5685  * Control the generation of page faults with @fault.  Return false if
5686  * there is no work to do, which can only happen with @fault == FAULT_NO.
5687  */
5688 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5689                          CPUARMState *env, target_ulong addr,
5690                          MMUAccessType access_type, uintptr_t retaddr)
5691 {
5692     int mmu_idx = arm_env_mmu_index(env);
5693     int mem_off = info->mem_off_first[0];
5694     bool nofault = fault == FAULT_NO;
5695     bool have_work = true;
5696 
5697     if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5698                         access_type, mmu_idx, retaddr)) {
5699         /* No work to be done. */
5700         return false;
5701     }
5702 
5703     if (likely(info->page_split < 0)) {
5704         /* The entire operation was on the one page. */
5705         return true;
5706     }
5707 
5708     /*
5709      * If the second page is invalid, then we want the fault address to be
5710      * the first byte on that page which is accessed.
5711      */
5712     if (info->mem_off_split >= 0) {
5713         /*
5714          * There is an element split across the pages.  The fault address
5715          * should be the first byte of the second page.
5716          */
5717         mem_off = info->page_split;
5718         /*
5719          * If the split element is also the first active element
5720          * of the vector, then:  For first-fault we should continue
5721          * to generate faults for the second page.  For no-fault,
5722          * we have work only if the second page is valid.
5723          */
5724         if (info->mem_off_first[0] < info->mem_off_split) {
5725             nofault = FAULT_FIRST;
5726             have_work = false;
5727         }
5728     } else {
5729         /*
5730          * There is no element split across the pages.  The fault address
5731          * should be the first active element on the second page.
5732          */
5733         mem_off = info->mem_off_first[1];
5734         /*
5735          * There must have been one active element on the first page,
5736          * so we're out of first-fault territory.
5737          */
5738         nofault = fault != FAULT_ALL;
5739     }
5740 
5741     have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5742                                 access_type, mmu_idx, retaddr);
5743     return have_work;
5744 }
5745 
5746 #ifndef CONFIG_USER_ONLY
5747 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5748                                uint64_t *vg, target_ulong addr,
5749                                int esize, int msize, int wp_access,
5750                                uintptr_t retaddr)
5751 {
5752     intptr_t mem_off, reg_off, reg_last;
5753     int flags0 = info->page[0].flags;
5754     int flags1 = info->page[1].flags;
5755 
5756     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5757         return;
5758     }
5759 
5760     /* Indicate that watchpoints are handled. */
5761     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5762     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5763 
5764     if (flags0 & TLB_WATCHPOINT) {
5765         mem_off = info->mem_off_first[0];
5766         reg_off = info->reg_off_first[0];
5767         reg_last = info->reg_off_last[0];
5768 
5769         while (reg_off <= reg_last) {
5770             uint64_t pg = vg[reg_off >> 6];
5771             do {
5772                 if ((pg >> (reg_off & 63)) & 1) {
5773                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5774                                          msize, info->page[0].attrs,
5775                                          wp_access, retaddr);
5776                 }
5777                 reg_off += esize;
5778                 mem_off += msize;
5779             } while (reg_off <= reg_last && (reg_off & 63));
5780         }
5781     }
5782 
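    /* Check any active element that is split across the two pages. */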
5783     mem_off = info->mem_off_split;
5784     if (mem_off >= 0) {
5785         cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5786                              info->page[0].attrs, wp_access, retaddr);
5787     }
5788 
5789     mem_off = info->mem_off_first[1];
5790     if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5791         reg_off = info->reg_off_first[1];
5792         reg_last = info->reg_off_last[1];
5793 
5794         do {
5795             uint64_t pg = vg[reg_off >> 6];
5796             do {
5797                 if ((pg >> (reg_off & 63)) & 1) {
5798                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5799                                          msize, info->page[1].attrs,
5800                                          wp_access, retaddr);
5801                 }
5802                 reg_off += esize;
5803                 mem_off += msize;
5804             } while (reg_off & 63);
5805         } while (reg_off <= reg_last);
5806     }
5807 }
5808 #endif
5809 
5810 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5811                              uint64_t *vg, target_ulong addr, int esize,
5812                              int msize, uint32_t mtedesc, uintptr_t ra)
5813 {
5814     intptr_t mem_off, reg_off, reg_last;
5815 
5816     /* Process the page only if MemAttr == Tagged. */
5817     if (info->page[0].tagged) {
5818         mem_off = info->mem_off_first[0];
5819         reg_off = info->reg_off_first[0];
5820         reg_last = info->reg_off_split;
5821         if (reg_last < 0) {
5822             reg_last = info->reg_off_last[0];
5823         }
5824 
5825         do {
5826             uint64_t pg = vg[reg_off >> 6];
5827             do {
5828                 if ((pg >> (reg_off & 63)) & 1) {
5829                     mte_check(env, mtedesc, addr + mem_off, ra);
5830                 }
5831                 reg_off += esize;
5832                 mem_off += msize;
5833             } while (reg_off <= reg_last && (reg_off & 63));
5834         } while (reg_off <= reg_last);
5835     }
5836 
5837     mem_off = info->mem_off_first[1];
5838     if (mem_off >= 0 && info->page[1].tagged) {
5839         reg_off = info->reg_off_first[1];
5840         reg_last = info->reg_off_last[1];
5841 
5842         do {
5843             uint64_t pg = vg[reg_off >> 6];
5844             do {
5845                 if ((pg >> (reg_off & 63)) & 1) {
5846                     mte_check(env, mtedesc, addr + mem_off, ra);
5847                 }
5848                 reg_off += esize;
5849                 mem_off += msize;
5850             } while (reg_off & 63);
5851         } while (reg_off <= reg_last);
5852     }
5853 }
5854 
5855 /*
5856  * Common helper for all contiguous 1,2,3,4-register predicated loads.
5857  */
5858 static inline QEMU_ALWAYS_INLINE
5859 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5860                uint32_t desc, const uintptr_t retaddr,
5861                const int esz, const int msz, const int N, uint32_t mtedesc,
5862                sve_ldst1_host_fn *host_fn,
5863                sve_ldst1_tlb_fn *tlb_fn)
5864 {
5865     const unsigned rd = simd_data(desc);
5866     const intptr_t reg_max = simd_oprsz(desc);
5867     intptr_t reg_off, reg_last, mem_off;
5868     SVEContLdSt info;
5869     void *host;
5870     int flags, i;
5871 
5872     /* Find the active elements.  */
5873     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5874         /* The entire predicate was false; no load occurs.  */
5875         for (i = 0; i < N; ++i) {
5876             memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5877         }
5878         return;
5879     }
5880 
5881     /* Probe the page(s).  Exit with exception for any invalid page. */
5882     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5883 
5884     /* Handle watchpoints for all active elements. */
5885     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5886                               BP_MEM_READ, retaddr);
5887 
5888     /*
5889      * Handle mte checks for all active elements.
5890      * Since TBI must be set for MTE, !mtedesc => !mte_active.
5891      */
5892     if (mtedesc) {
5893         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5894                                 mtedesc, retaddr);
5895     }
5896 
5897     flags = info.page[0].flags | info.page[1].flags;
5898     if (unlikely(flags != 0)) {
5899         /*
5900          * At least one page includes MMIO.
5901          * Any bus operation can fail with cpu_transaction_failed,
5902          * which for ARM will raise SyncExternal.  Perform the load
5903          * into scratch memory to preserve register state until the end.
5904          */
5905         ARMVectorReg scratch[4] = { };
5906 
5907         mem_off = info.mem_off_first[0];
5908         reg_off = info.reg_off_first[0];
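        /*
         * Iterate through to the last active element: prefer the last
         * element on the second page, else the split element, else the
         * last element on the first page.
         */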
5909         reg_last = info.reg_off_last[1];
5910         if (reg_last < 0) {
5911             reg_last = info.reg_off_split;
5912             if (reg_last < 0) {
5913                 reg_last = info.reg_off_last[0];
5914             }
5915         }
5916 
5917         do {
5918             uint64_t pg = vg[reg_off >> 6];
5919             do {
5920                 if ((pg >> (reg_off & 63)) & 1) {
5921                     for (i = 0; i < N; ++i) {
5922                         tlb_fn(env, &scratch[i], reg_off,
5923                                addr + mem_off + (i << msz), retaddr);
5924                     }
5925                 }
5926                 reg_off += 1 << esz;
5927                 mem_off += N << msz;
5928             } while (reg_off & 63);
5929         } while (reg_off <= reg_last);
5930 
5931         for (i = 0; i < N; ++i) {
5932             memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5933         }
5934         return;
5935     }
5936 
5937     /* The entire operation is in RAM, on valid pages. */
5938 
5939     for (i = 0; i < N; ++i) {
5940         memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5941     }
5942 
5943     mem_off = info.mem_off_first[0];
5944     reg_off = info.reg_off_first[0];
5945     reg_last = info.reg_off_last[0];
5946     host = info.page[0].host;
5947 
5948     set_helper_retaddr(retaddr);
5949 
5950     while (reg_off <= reg_last) {
5951         uint64_t pg = vg[reg_off >> 6];
5952         do {
5953             if ((pg >> (reg_off & 63)) & 1) {
5954                 for (i = 0; i < N; ++i) {
5955                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5956                             host + mem_off + (i << msz));
5957                 }
5958             }
5959             reg_off += 1 << esz;
5960             mem_off += N << msz;
5961         } while (reg_off <= reg_last && (reg_off & 63));
5962     }
5963 
5964     clear_helper_retaddr();
5965 
5966     /*
5967      * Use the slow path to manage the cross-page misalignment.
5968      * But we know this is RAM and cannot trap.
5969      */
5970     mem_off = info.mem_off_split;
5971     if (unlikely(mem_off >= 0)) {
5972         reg_off = info.reg_off_split;
5973         for (i = 0; i < N; ++i) {
5974             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5975                    addr + mem_off + (i << msz), retaddr);
5976         }
5977     }
5978 
5979     mem_off = info.mem_off_first[1];
5980     if (unlikely(mem_off >= 0)) {
5981         reg_off = info.reg_off_first[1];
5982         reg_last = info.reg_off_last[1];
5983         host = info.page[1].host;
5984 
5985         set_helper_retaddr(retaddr);
5986 
5987         do {
5988             uint64_t pg = vg[reg_off >> 6];
5989             do {
5990                 if ((pg >> (reg_off & 63)) & 1) {
5991                     for (i = 0; i < N; ++i) {
5992                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5993                                 host + mem_off + (i << msz));
5994                     }
5995                 }
5996                 reg_off += 1 << esz;
5997                 mem_off += N << msz;
5998             } while (reg_off & 63);
5999         } while (reg_off <= reg_last);
6000 
6001         clear_helper_retaddr();
6002     }
6003 }
6004 
6005 static inline QEMU_ALWAYS_INLINE
6006 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6007                    uint32_t desc, const uintptr_t ra,
6008                    const int esz, const int msz, const int N,
6009                    sve_ldst1_host_fn *host_fn,
6010                    sve_ldst1_tlb_fn *tlb_fn)
6011 {
6012     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
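    /* Bit 55 selects the address-space half, and thus the TBI/TCMA bits. */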
6013     int bit55 = extract64(addr, 55, 1);
6014 
6015     /* Remove mtedesc from the normal sve descriptor. */
6016     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6017 
6018     /* Perform gross MTE suppression early. */
6019     if (!tbi_check(mtedesc, bit55) ||
6020         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6021         mtedesc = 0;
6022     }
6023 
6024     sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6025 }
6026 
6027 #define DO_LD1_1(NAME, ESZ)                                             \
6028 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
6029                             target_ulong addr, uint32_t desc)           \
6030 {                                                                       \
6031     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
6032               sve_##NAME##_host, sve_##NAME##_tlb);                     \
6033 }                                                                       \
6034 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
6035                                 target_ulong addr, uint32_t desc)       \
6036 {                                                                       \
6037     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
6038                   sve_##NAME##_host, sve_##NAME##_tlb);                 \
6039 }
6040 
6041 #define DO_LD1_2(NAME, ESZ, MSZ)                                        \
6042 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
6043                                target_ulong addr, uint32_t desc)        \
6044 {                                                                       \
6045     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
6046               sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
6047 }                                                                       \
6048 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
6049                                target_ulong addr, uint32_t desc)        \
6050 {                                                                       \
6051     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
6052               sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
6053 }                                                                       \
6054 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
6055                                    target_ulong addr, uint32_t desc)    \
6056 {                                                                       \
6057     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
6058                   sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
6059 }                                                                       \
6060 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
6061                                    target_ulong addr, uint32_t desc)    \
6062 {                                                                       \
6063     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
6064                   sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
6065 }
6066 
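/*
 * Naming: ld1<msz><esz>[u|s] loads one register of <msz>-sized memory
 * elements into <esz>-sized register elements, zero- ('u') or sign-
 * ('s') extending when the sizes differ, e.g. ld1bhu loads bytes
 * zero-extended into 16-bit elements.
 */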
6067 DO_LD1_1(ld1bb,  MO_8)
6068 DO_LD1_1(ld1bhu, MO_16)
6069 DO_LD1_1(ld1bhs, MO_16)
6070 DO_LD1_1(ld1bsu, MO_32)
6071 DO_LD1_1(ld1bss, MO_32)
6072 DO_LD1_1(ld1bdu, MO_64)
6073 DO_LD1_1(ld1bds, MO_64)
6074 
6075 DO_LD1_2(ld1hh,  MO_16, MO_16)
6076 DO_LD1_2(ld1hsu, MO_32, MO_16)
6077 DO_LD1_2(ld1hss, MO_32, MO_16)
6078 DO_LD1_2(ld1hdu, MO_64, MO_16)
6079 DO_LD1_2(ld1hds, MO_64, MO_16)
6080 
6081 DO_LD1_2(ld1ss,  MO_32, MO_32)
6082 DO_LD1_2(ld1sdu, MO_64, MO_32)
6083 DO_LD1_2(ld1sds, MO_64, MO_32)
6084 
6085 DO_LD1_2(ld1dd,  MO_64, MO_64)
6086 
6087 #undef DO_LD1_1
6088 #undef DO_LD1_2
6089 
6090 #define DO_LDN_1(N)                                                     \
6091 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
6092                              target_ulong addr, uint32_t desc)          \
6093 {                                                                       \
6094     sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
6095               sve_ld1bb_host, sve_ld1bb_tlb);                           \
6096 }                                                                       \
6097 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
6098                                  target_ulong addr, uint32_t desc)      \
6099 {                                                                       \
6100     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
6101                   sve_ld1bb_host, sve_ld1bb_tlb);                       \
6102 }
6103 
6104 #define DO_LDN_2(N, SUFF, ESZ)                                          \
6105 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
6106                                     target_ulong addr, uint32_t desc)   \
6107 {                                                                       \
6108     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
6109               sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
6110 }                                                                       \
6111 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
6112                                     target_ulong addr, uint32_t desc)   \
6113 {                                                                       \
6114     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
6115               sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
6116 }                                                                       \
6117 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
6118                                         target_ulong addr, uint32_t desc) \
6119 {                                                                       \
6120     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
6121                   sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
6122 }                                                                       \
6123 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
6124                                         target_ulong addr, uint32_t desc) \
6125 {                                                                       \
6126     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
6127                   sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
6128 }
6129 
6130 DO_LDN_1(2)
6131 DO_LDN_1(3)
6132 DO_LDN_1(4)
6133 
6134 DO_LDN_2(2, hh, MO_16)
6135 DO_LDN_2(3, hh, MO_16)
6136 DO_LDN_2(4, hh, MO_16)
6137 
6138 DO_LDN_2(2, ss, MO_32)
6139 DO_LDN_2(3, ss, MO_32)
6140 DO_LDN_2(4, ss, MO_32)
6141 
6142 DO_LDN_2(2, dd, MO_64)
6143 DO_LDN_2(3, dd, MO_64)
6144 DO_LDN_2(4, dd, MO_64)
6145 
6146 #undef DO_LDN_1
6147 #undef DO_LDN_2
6148 
6149 /*
6150  * Load contiguous data, first-fault and no-fault.
6151  *
6152  * For user-only, we control the race between page_check_range and
6153  * another thread's munmap by using set/clear_helper_retaddr.  Any
6154  * SEGV that occurs between those markers is assumed to be because
6155  * the guest page vanished.  Keep that block as small as possible
6156  * so that unrelated QEMU bugs are not blamed on the guest.
6157  */
6158 
6159 /* Fault on byte I.  All bits in FFR from I are cleared.  The vector
6160  * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
6161  * option, which leaves subsequent data unchanged.
6162  */
6163 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
6164 {
6165     uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
6166 
6167     if (i & 63) {
6168         ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
6169         i = ROUND_UP(i, 64);
6170     }
6171     for (; i < oprsz; i += 64) {
6172         ffr[i / 64] = 0;
6173     }
6174 }
6175 
6176 /*
6177  * Common helper for all contiguous no-fault and first-fault loads.
6178  */
6179 static inline QEMU_ALWAYS_INLINE
6180 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6181                    uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6182                    const int esz, const int msz, const SVEContFault fault,
6183                    sve_ldst1_host_fn *host_fn,
6184                    sve_ldst1_tlb_fn *tlb_fn)
6185 {
6186     const unsigned rd = simd_data(desc);
6187     void *vd = &env->vfp.zregs[rd];
6188     const intptr_t reg_max = simd_oprsz(desc);
6189     intptr_t reg_off, mem_off, reg_last;
6190     SVEContLdSt info;
6191     int flags;
6192     void *host;
6193 
6194     /* Find the active elements.  */
6195     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6196         /* The entire predicate was false; no load occurs.  */
6197         memset(vd, 0, reg_max);
6198         return;
6199     }
6200     reg_off = info.reg_off_first[0];
6201 
6202     /* Probe the page(s). */
6203     if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6204         /* Fault on first element. */
6205         tcg_debug_assert(fault == FAULT_NO);
6206         memset(vd, 0, reg_max);
6207         goto do_fault;
6208     }
6209 
6210     mem_off = info.mem_off_first[0];
6211     flags = info.page[0].flags;
6212 
6213     /*
6214      * Disable MTE checking if the Tagged bit is not set.  Since TBI must
6215      * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6216      */
6217     if (!info.page[0].tagged) {
6218         mtedesc = 0;
6219     }
6220 
6221     if (fault == FAULT_FIRST) {
6222         /* Trapping mte check for the first-fault element.  */
6223         if (mtedesc) {
6224             mte_check(env, mtedesc, addr + mem_off, retaddr);
6225         }
6226 
6227         /*
6228          * Special handling of the first active element,
6229          * if it crosses a page boundary or is MMIO.
6230          */
6231         bool is_split = mem_off == info.mem_off_split;
6232         if (unlikely(flags != 0) || unlikely(is_split)) {
6233             /*
6234              * Use the slow path for cross-page handling.
6235              * Might trap for MMIO or watchpoints.
6236              */
6237             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6238 
6239             /* After any fault, zero the other elements. */
6240             swap_memzero(vd, reg_off);
6241             reg_off += 1 << esz;
6242             mem_off += 1 << msz;
6243             swap_memzero(vd + reg_off, reg_max - reg_off);
6244 
6245             if (is_split) {
6246                 goto second_page;
6247             }
6248         } else {
6249             memset(vd, 0, reg_max);
6250         }
6251     } else {
6252         memset(vd, 0, reg_max);
6253         if (unlikely(mem_off == info.mem_off_split)) {
6254             /* The first active element crosses a page boundary. */
6255             flags |= info.page[1].flags;
6256             if (unlikely(flags & TLB_MMIO)) {
6257                 /* Some page is MMIO, see below. */
6258                 goto do_fault;
6259             }
6260             if (unlikely(flags & TLB_WATCHPOINT) &&
6261                 (cpu_watchpoint_address_matches
6262                  (env_cpu(env), addr + mem_off, 1 << msz)
6263                  & BP_MEM_READ)) {
6264                 /* Watchpoint hit, see below. */
6265                 goto do_fault;
6266             }
6267             if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6268                 goto do_fault;
6269             }
6270             /*
6271              * Use the slow path for cross-page handling.
6272              * This is RAM, without a watchpoint, and will not trap.
6273              */
6274             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6275             goto second_page;
6276         }
6277     }
6278 
6279     /*
6280      * From this point on, all memory operations are MemSingleNF.
6281      *
6282      * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6283      * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6284      *
6285      * Unfortunately we do not have access to the memory attributes from the
6286      * PTE to tell Device memory from Normal memory.  So we make a mostly
6287      * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6288      * This gives the right answer for the common cases of "Normal memory,
6289      * backed by host RAM" and "Device memory, backed by MMIO".
6290      * The architecture allows us to suppress an NF load and return
6291      * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6292      * case of "Normal memory, backed by MMIO" is permitted.  The case we
6293      * get wrong is "Device memory, backed by host RAM", for which we
6294      * should return (UNKNOWN, FAULT) for but do not.
6295      * should return (UNKNOWN, FAULT) but do not.
6296      * Similarly, CPU_BP breakpoints would raise exceptions, and so
6297      * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
6298      * architectural breakpoints the same.
6299      */
6300     if (unlikely(flags & TLB_MMIO)) {
6301         goto do_fault;
6302     }
6303 
6304     reg_last = info.reg_off_last[0];
6305     host = info.page[0].host;
6306 
6307     set_helper_retaddr(retaddr);
6308 
6309     do {
6310         uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6311         do {
6312             if ((pg >> (reg_off & 63)) & 1) {
6313                 if (unlikely(flags & TLB_WATCHPOINT) &&
6314                     (cpu_watchpoint_address_matches
6315                      (env_cpu(env), addr + mem_off, 1 << msz)
6316                      & BP_MEM_READ)) {
6317                     clear_helper_retaddr();
6318                     goto do_fault;
6319                 }
6320                 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6321                     clear_helper_retaddr();
6322                     goto do_fault;
6323                 }
6324                 host_fn(vd, reg_off, host + mem_off);
6325             }
6326             reg_off += 1 << esz;
6327             mem_off += 1 << msz;
6328         } while (reg_off <= reg_last && (reg_off & 63));
6329     } while (reg_off <= reg_last);
6330 
6331     clear_helper_retaddr();
6332 
6333     /*
6334      * MemSingleNF is allowed to fail for any reason.  We have special
6335      * code above to handle the first element crossing a page boundary.
6336      * As an implementation choice, decline to handle a cross-page element
6337      * in any other position.
6338      */
6339     reg_off = info.reg_off_split;
6340     if (reg_off >= 0) {
6341         goto do_fault;
6342     }
6343 
6344  second_page:
6345     reg_off = info.reg_off_first[1];
6346     if (likely(reg_off < 0)) {
6347         /* No active elements on the second page.  All done. */
6348         return;
6349     }
6350 
6351     /*
6352      * MemSingleNF is allowed to fail for any reason.  As an implementation
6353      * choice, decline to handle elements on the second page.  This should
6354      * be low frequency as the guest walks through memory -- the next
6355      * iteration of the guest's loop should be aligned on the page boundary,
6356      * and then all following iterations will stay aligned.
6357      */
6358 
6359  do_fault:
6360     record_fault(env, reg_off, reg_max);
6361 }
6362 
6363 static inline QEMU_ALWAYS_INLINE
6364 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6365                        uint32_t desc, const uintptr_t retaddr,
6366                        const int esz, const int msz, const SVEContFault fault,
6367                        sve_ldst1_host_fn *host_fn,
6368                        sve_ldst1_tlb_fn *tlb_fn)
6369 {
6370     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6371     int bit55 = extract64(addr, 55, 1);
6372 
6373     /* Remove mtedesc from the normal sve descriptor. */
6374     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6375 
6376     /* Perform gross MTE suppression early. */
6377     if (!tbi_check(mtedesc, bit55) ||
6378         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6379         mtedesc = 0;
6380     }
6381 
6382     sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6383                   esz, msz, fault, host_fn, tlb_fn);
6384 }
6385 
6386 #define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
6387 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
6388                                  target_ulong addr, uint32_t desc)      \
6389 {                                                                       \
6390     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6391                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6392 }                                                                       \
6393 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
6394                                  target_ulong addr, uint32_t desc)      \
6395 {                                                                       \
6396     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6397                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6398 }                                                                       \
6399 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6400                                      target_ulong addr, uint32_t desc)  \
6401 {                                                                       \
6402     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6403                       sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
6404 }                                                                       \
6405 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6406                                      target_ulong addr, uint32_t desc)  \
6407 {                                                                       \
6408     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6409                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6410 }
6411 
6412 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
6413 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
6414                                     target_ulong addr, uint32_t desc)   \
6415 {                                                                       \
6416     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6417                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6418 }                                                                       \
6419 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
6420                                     target_ulong addr, uint32_t desc)   \
6421 {                                                                       \
6422     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6423                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6424 }                                                                       \
6425 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
6426                                     target_ulong addr, uint32_t desc)   \
6427 {                                                                       \
6428     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6429                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6430 }                                                                       \
6431 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
6432                                     target_ulong addr, uint32_t desc)   \
6433 {                                                                       \
6434     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6435                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6436 }                                                                       \
6437 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6438                                         target_ulong addr, uint32_t desc) \
6439 {                                                                       \
6440     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6441                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6442 }                                                                       \
6443 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6444                                         target_ulong addr, uint32_t desc) \
6445 {                                                                       \
6446     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6447                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6448 }                                                                       \
6449 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6450                                         target_ulong addr, uint32_t desc) \
6451 {                                                                       \
6452     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6453                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6454 }                                                                       \
6455 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6456                                         target_ulong addr, uint32_t desc) \
6457 {                                                                       \
6458     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6459                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6460 }
6461 
6462 DO_LDFF1_LDNF1_1(bb,  MO_8)
6463 DO_LDFF1_LDNF1_1(bhu, MO_16)
6464 DO_LDFF1_LDNF1_1(bhs, MO_16)
6465 DO_LDFF1_LDNF1_1(bsu, MO_32)
6466 DO_LDFF1_LDNF1_1(bss, MO_32)
6467 DO_LDFF1_LDNF1_1(bdu, MO_64)
6468 DO_LDFF1_LDNF1_1(bds, MO_64)
6469 
6470 DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
6471 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6472 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6473 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6474 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6475 
6476 DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
6477 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6478 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6479 
6480 DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)
6481 
6482 #undef DO_LDFF1_LDNF1_1
6483 #undef DO_LDFF1_LDNF1_2
6484 
6485 /*
6486  * Common helper for all contiguous 1,2,3,4-register predicated stores.
6487  */
6488 
6489 static inline QEMU_ALWAYS_INLINE
6490 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6491                uint32_t desc, const uintptr_t retaddr,
6492                const int esz, const int msz, const int N, uint32_t mtedesc,
6493                sve_ldst1_host_fn *host_fn,
6494                sve_ldst1_tlb_fn *tlb_fn)
6495 {
6496     const unsigned rd = simd_data(desc);
6497     const intptr_t reg_max = simd_oprsz(desc);
6498     intptr_t reg_off, reg_last, mem_off;
6499     SVEContLdSt info;
6500     void *host;
6501     int i, flags;
6502 
6503     /* Find the active elements.  */
6504     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6505         /* The entire predicate was false; no store occurs.  */
6506         return;
6507     }
6508 
6509     /* Probe the page(s).  Exit with exception for any invalid page. */
6510     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6511 
6512     /* Handle watchpoints for all active elements. */
6513     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6514                               BP_MEM_WRITE, retaddr);
6515 
6516     /*
6517      * Handle mte checks for all active elements.
6518      * Since TBI must be set for MTE, !mtedesc => !mte_active.
6519      */
6520     if (mtedesc) {
6521         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6522                                 mtedesc, retaddr);
6523     }
6524 
6525     flags = info.page[0].flags | info.page[1].flags;
6526     if (unlikely(flags != 0)) {
6527         /*
6528          * At least one page includes MMIO.
6529          * Any bus operation can fail with cpu_transaction_failed,
6530          * which for ARM will raise SyncExternal.  We cannot avoid
6531          * this fault and will leave with the store incomplete.
6532          */
6533         mem_off = info.mem_off_first[0];
6534         reg_off = info.reg_off_first[0];
6535         reg_last = info.reg_off_last[1];
6536         if (reg_last < 0) {
6537             reg_last = info.reg_off_split;
6538             if (reg_last < 0) {
6539                 reg_last = info.reg_off_last[0];
6540             }
6541         }
6542 
6543         do {
6544             uint64_t pg = vg[reg_off >> 6];
6545             do {
6546                 if ((pg >> (reg_off & 63)) & 1) {
6547                     for (i = 0; i < N; ++i) {
6548                         tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6549                                addr + mem_off + (i << msz), retaddr);
6550                     }
6551                 }
6552                 reg_off += 1 << esz;
6553                 mem_off += N << msz;
6554             } while (reg_off & 63);
6555         } while (reg_off <= reg_last);
6556         return;
6557     }
6558 
6559     mem_off = info.mem_off_first[0];
6560     reg_off = info.reg_off_first[0];
6561     reg_last = info.reg_off_last[0];
6562     host = info.page[0].host;
6563 
6564     set_helper_retaddr(retaddr);
6565 
6566     while (reg_off <= reg_last) {
6567         uint64_t pg = vg[reg_off >> 6];
6568         do {
6569             if ((pg >> (reg_off & 63)) & 1) {
6570                 for (i = 0; i < N; ++i) {
6571                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6572                             host + mem_off + (i << msz));
6573                 }
6574             }
6575             reg_off += 1 << esz;
6576             mem_off += N << msz;
6577         } while (reg_off <= reg_last && (reg_off & 63));
6578     }
6579 
6580     clear_helper_retaddr();
6581 
6582     /*
6583      * Use the slow path to manage the cross-page misalignment.
6584      * But we know this is RAM and cannot trap.
6585      */
6586     mem_off = info.mem_off_split;
6587     if (unlikely(mem_off >= 0)) {
6588         reg_off = info.reg_off_split;
6589         for (i = 0; i < N; ++i) {
6590             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6591                    addr + mem_off + (i << msz), retaddr);
6592         }
6593     }
6594 
6595     mem_off = info.mem_off_first[1];
6596     if (unlikely(mem_off >= 0)) {
6597         reg_off = info.reg_off_first[1];
6598         reg_last = info.reg_off_last[1];
6599         host = info.page[1].host;
6600 
6601         set_helper_retaddr(retaddr);
6602 
6603         do {
6604             uint64_t pg = vg[reg_off >> 6];
6605             do {
6606                 if ((pg >> (reg_off & 63)) & 1) {
6607                     for (i = 0; i < N; ++i) {
6608                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6609                                 host + mem_off + (i << msz));
6610                     }
6611                 }
6612                 reg_off += 1 << esz;
6613                 mem_off += N << msz;
6614             } while (reg_off & 63);
6615         } while (reg_off <= reg_last);
6616 
6617         clear_helper_retaddr();
6618     }
6619 }
6620 
6621 static inline QEMU_ALWAYS_INLINE
6622 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6623                    uint32_t desc, const uintptr_t ra,
6624                    const int esz, const int msz, const int N,
6625                    sve_ldst1_host_fn *host_fn,
6626                    sve_ldst1_tlb_fn *tlb_fn)
6627 {
6628     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6629     int bit55 = extract64(addr, 55, 1);
6630 
6631     /* Remove mtedesc from the normal sve descriptor. */
6632     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6633 
6634     /* Perform gross MTE suppression early. */
6635     if (!tbi_check(mtedesc, bit55) ||
6636         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6637         mtedesc = 0;
6638     }
6639 
6640     sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6641 }
6642 
6643 #define DO_STN_1(N, NAME, ESZ)                                          \
6644 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
6645                                  target_ulong addr, uint32_t desc)      \
6646 {                                                                       \
6647     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
6648               sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
6649 }                                                                       \
6650 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
6651                                      target_ulong addr, uint32_t desc)  \
6652 {                                                                       \
6653     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
6654                   sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
6655 }
6656 
6657 #define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
6658 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
6659                                     target_ulong addr, uint32_t desc)   \
6660 {                                                                       \
6661     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6662               sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
6663 }                                                                       \
6664 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
6665                                     target_ulong addr, uint32_t desc)   \
6666 {                                                                       \
6667     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6668               sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
6669 }                                                                       \
6670 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
6671                                         target_ulong addr, uint32_t desc) \
6672 {                                                                       \
6673     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6674                   sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
6675 }                                                                       \
6676 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
6677                                         target_ulong addr, uint32_t desc) \
6678 {                                                                       \
6679     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6680                   sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
6681 }
6682 
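/*
 * Naming: st1<msz><esz> stores the low <msz> bytes of each <esz>-sized
 * register element, e.g. st1bh stores the low byte of each 16-bit element.
 */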
6683 DO_STN_1(1, bb, MO_8)
6684 DO_STN_1(1, bh, MO_16)
6685 DO_STN_1(1, bs, MO_32)
6686 DO_STN_1(1, bd, MO_64)
6687 DO_STN_1(2, bb, MO_8)
6688 DO_STN_1(3, bb, MO_8)
6689 DO_STN_1(4, bb, MO_8)
6690 
6691 DO_STN_2(1, hh, MO_16, MO_16)
6692 DO_STN_2(1, hs, MO_32, MO_16)
6693 DO_STN_2(1, hd, MO_64, MO_16)
6694 DO_STN_2(2, hh, MO_16, MO_16)
6695 DO_STN_2(3, hh, MO_16, MO_16)
6696 DO_STN_2(4, hh, MO_16, MO_16)
6697 
6698 DO_STN_2(1, ss, MO_32, MO_32)
6699 DO_STN_2(1, sd, MO_64, MO_32)
6700 DO_STN_2(2, ss, MO_32, MO_32)
6701 DO_STN_2(3, ss, MO_32, MO_32)
6702 DO_STN_2(4, ss, MO_32, MO_32)
6703 
6704 DO_STN_2(1, dd, MO_64, MO_64)
6705 DO_STN_2(2, dd, MO_64, MO_64)
6706 DO_STN_2(3, dd, MO_64, MO_64)
6707 DO_STN_2(4, dd, MO_64, MO_64)
6708 
6709 #undef DO_STN_1
6710 #undef DO_STN_2
6711 
6712 /*
6713  * Loads with a vector index.
6714  */
6715 
6716 /*
6717  * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6718  */
6719 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6720 
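/*
 * off_zsu_* and off_zss_* extract 32-bit unsigned and signed offsets
 * respectively; off_zd_d extracts a full 64-bit offset.  The _s/_d
 * suffix is the size of the vector element holding the offset.
 */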
6721 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6722 {
6723     return *(uint32_t *)(reg + H1_4(reg_ofs));
6724 }
6725 
6726 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6727 {
6728     return *(int32_t *)(reg + H1_4(reg_ofs));
6729 }
6730 
6731 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6732 {
6733     return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6734 }
6735 
6736 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6737 {
6738     return (int32_t)*(uint64_t *)(reg + reg_ofs);
6739 }
6740 
6741 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6742 {
6743     return *(uint64_t *)(reg + reg_ofs);
6744 }
6745 
6746 static inline QEMU_ALWAYS_INLINE
6747 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6748                target_ulong base, uint32_t desc, uintptr_t retaddr,
6749                uint32_t mtedesc, int esize, int msize,
6750                zreg_off_fn *off_fn,
6751                sve_ldst1_host_fn *host_fn,
6752                sve_ldst1_tlb_fn *tlb_fn)
6753 {
6754     const int mmu_idx = arm_env_mmu_index(env);
6755     const intptr_t reg_max = simd_oprsz(desc);
6756     const int scale = simd_data(desc);
6757     ARMVectorReg scratch;
6758     intptr_t reg_off;
6759     SVEHostPage info, info2;
6760 
6761     memset(&scratch, 0, reg_max);
6762     reg_off = 0;
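    /*
     * Gather addresses may touch arbitrary pages, so probe and check
     * (watchpoints, MTE) each active element individually; MMIO and
     * page-crossing elements fall back to the slow tlb_fn() path.
     */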
6763     do {
6764         uint64_t pg = vg[reg_off >> 6];
6765         do {
6766             if (likely(pg & 1)) {
6767                 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6768                 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6769 
6770                 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6771                                mmu_idx, retaddr);
6772 
6773                 if (likely(in_page >= msize)) {
6774                     if (unlikely(info.flags & TLB_WATCHPOINT)) {
6775                         cpu_check_watchpoint(env_cpu(env), addr, msize,
6776                                              info.attrs, BP_MEM_READ, retaddr);
6777                     }
6778                     if (mtedesc && info.tagged) {
6779                         mte_check(env, mtedesc, addr, retaddr);
6780                     }
6781                     if (unlikely(info.flags & TLB_MMIO)) {
6782                         tlb_fn(env, &scratch, reg_off, addr, retaddr);
6783                     } else {
6784                         set_helper_retaddr(retaddr);
6785                         host_fn(&scratch, reg_off, info.host);
6786                         clear_helper_retaddr();
6787                     }
6788                 } else {
6789                     /* Element crosses the page boundary. */
6790                     sve_probe_page(&info2, false, env, addr + in_page, 0,
6791                                    MMU_DATA_LOAD, mmu_idx, retaddr);
6792                     if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6793                         cpu_check_watchpoint(env_cpu(env), addr,
6794                                              msize, info.attrs,
6795                                              BP_MEM_READ, retaddr);
6796                     }
6797                     if (mtedesc && info.tagged) {
6798                         mte_check(env, mtedesc, addr, retaddr);
6799                     }
6800                     tlb_fn(env, &scratch, reg_off, addr, retaddr);
6801                 }
6802             }
6803             reg_off += esize;
6804             pg >>= esize;
6805         } while (reg_off & 63);
6806     } while (reg_off < reg_max);
6807 
6808     /* Wait until all exceptions have been raised to write back.  */
6809     memcpy(vd, &scratch, reg_max);
6810 }
6811 
6812 static inline QEMU_ALWAYS_INLINE
6813 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6814                    target_ulong base, uint32_t desc, uintptr_t retaddr,
6815                    int esize, int msize, zreg_off_fn *off_fn,
6816                    sve_ldst1_host_fn *host_fn,
6817                    sve_ldst1_tlb_fn *tlb_fn)
6818 {
6819     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6820     /* Remove mtedesc from the normal sve descriptor. */
6821     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6822 
6823     /*
6824      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6825      * move base entirely across the address space hole, so it cannot
6826      * change the pointer tag or the bit55 selector.  We could therefore
6827      * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
6828      */
6829     sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6830               esize, msize, off_fn, host_fn, tlb_fn);
6831 }
6832 
6833 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6834 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6835                                  void *vm, target_ulong base, uint32_t desc) \
6836 {                                                                            \
6837     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
6838               off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6839 }                                                                            \
6840 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6841      void *vm, target_ulong base, uint32_t desc)                             \
6842 {                                                                            \
6843     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
6844                   off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6845 }
6846 
6847 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6848 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6849                                  void *vm, target_ulong base, uint32_t desc) \
6850 {                                                                            \
6851     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
6852               off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6853 }                                                                            \
6854 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6855     void *vm, target_ulong base, uint32_t desc)                              \
6856 {                                                                            \
6857     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
6858                   off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6859 }
6860 
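/*
 * Instantiation naming: MEM encodes the memory element size, the vector
 * element size and the extension, plus endianness, e.g. "hsu_le" loads
 * little-endian halfwords zero-extended into word elements and "bds"
 * loads bytes sign-extended into doubleword elements.  OFS names the
 * offset form: zsu/zss for unsigned/signed 32-bit offsets, zd for
 * 64-bit offsets.
 */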
6861 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6862 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6863 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6864 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6865 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6866 
6867 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6868 DO_LD1_ZPZ_S(bss, zss, MO_8)
6869 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6870 DO_LD1_ZPZ_D(bds, zss, MO_8)
6871 DO_LD1_ZPZ_D(bds, zd, MO_8)
6872 
6873 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6874 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6875 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6876 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6877 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6878 
6879 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6880 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6881 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6882 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6883 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6884 
6885 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6886 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6887 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6888 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6889 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6890 
6891 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6892 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6893 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6894 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6895 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6896 
6897 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6898 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6899 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6900 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6901 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6902 
6903 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6904 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6905 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6906 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6907 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6908 
6909 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6910 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6911 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6912 
6913 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6914 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6915 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6916 
6917 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6918 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6919 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6920 
6921 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6922 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6923 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6924 
6925 #undef DO_LD1_ZPZ_S
6926 #undef DO_LD1_ZPZ_D
6927 
6928 /* First fault loads with a vector index.  */
6929 
6930 /*
6931  * Common helpers for all gather first-faulting loads.
6932  */
6933 
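/*
 * Only the first active element uses a normal, faulting access.  For
 * every later active element, any condition that might fault (an
 * unmapped or MMIO page, a page-crossing element, a read watchpoint
 * hit, or an MTE tag mismatch) stops the load instead, and
 * record_fault() reports the failing element through the FFR.
 */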
6934 static inline QEMU_ALWAYS_INLINE
6935 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6936                  target_ulong base, uint32_t desc, uintptr_t retaddr,
6937                  uint32_t mtedesc, const int esz, const int msz,
6938                  zreg_off_fn *off_fn,
6939                  sve_ldst1_host_fn *host_fn,
6940                  sve_ldst1_tlb_fn *tlb_fn)
6941 {
6942     const int mmu_idx = arm_env_mmu_index(env);
6943     const intptr_t reg_max = simd_oprsz(desc);
6944     const int scale = simd_data(desc);
6945     const int esize = 1 << esz;
6946     const int msize = 1 << msz;
6947     intptr_t reg_off;
6948     SVEHostPage info;
6949     target_ulong addr, in_page;
6950     ARMVectorReg scratch;
6951 
6952     /* Skip to the first true predicate.  */
6953     reg_off = find_next_active(vg, 0, reg_max, esz);
6954     if (unlikely(reg_off >= reg_max)) {
6955         /* The entire predicate was false; no load occurs.  */
6956         memset(vd, 0, reg_max);
6957         return;
6958     }
6959 
6960     /* Protect against overlap between vd and vm. */
6961     if (unlikely(vd == vm)) {
6962         vm = memcpy(&scratch, vm, reg_max);
6963     }
6964 
6965     /*
6966      * Probe the first element, allowing faults.
6967      */
6968     addr = base + (off_fn(vm, reg_off) << scale);
6969     if (mtedesc) {
6970         mte_check(env, mtedesc, addr, retaddr);
6971     }
6972     tlb_fn(env, vd, reg_off, addr, retaddr);
6973 
6974     /* After any fault, zero the other elements. */
6975     swap_memzero(vd, reg_off);
6976     reg_off += esize;
6977     swap_memzero(vd + reg_off, reg_max - reg_off);
6978 
6979     /*
6980      * Probe the remaining elements, not allowing faults.
6981      */
6982     while (reg_off < reg_max) {
6983         uint64_t pg = vg[reg_off >> 6];
6984         do {
6985             if (likely((pg >> (reg_off & 63)) & 1)) {
6986                 addr = base + (off_fn(vm, reg_off) << scale);
6987                 in_page = -(addr | TARGET_PAGE_MASK);
6988 
6989                 if (unlikely(in_page < msize)) {
6990                     /* Stop if the element crosses a page boundary. */
6991                     goto fault;
6992                 }
6993 
6994                 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6995                                mmu_idx, retaddr);
6996                 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6997                     goto fault;
6998                 }
6999                 if (unlikely(info.flags & TLB_WATCHPOINT) &&
7000                     (cpu_watchpoint_address_matches
7001                      (env_cpu(env), addr, msize) & BP_MEM_READ)) {
7002                     goto fault;
7003                 }
7004                 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
7005                     goto fault;
7006                 }
7007 
7008                 set_helper_retaddr(retaddr);
7009                 host_fn(vd, reg_off, info.host);
7010                 clear_helper_retaddr();
7011             }
7012             reg_off += esize;
7013         } while (reg_off & 63);
7014     }
7015     return;
7016 
7017  fault:
7018     record_fault(env, reg_off, reg_max);
7019 }
7020 
7021 static inline QEMU_ALWAYS_INLINE
7022 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7023                      target_ulong base, uint32_t desc, uintptr_t retaddr,
7024                      const int esz, const int msz,
7025                      zreg_off_fn *off_fn,
7026                      sve_ldst1_host_fn *host_fn,
7027                      sve_ldst1_tlb_fn *tlb_fn)
7028 {
7029     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7030     /* Remove mtedesc from the normal sve descriptor. */
7031     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7032 
7033     /*
7034      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7035      * move base entirely across the address space hole, so it cannot
7036      * change the pointer tag or the bit55 selector.  We could therefore
7037      * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
7038      */
7039     sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7040                 esz, msz, off_fn, host_fn, tlb_fn);
7041 }
7042 
7043 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
7044 void HELPER(sve_ldff##MEM##_##OFS)                                      \
7045     (CPUARMState *env, void *vd, void *vg,                              \
7046      void *vm, target_ulong base, uint32_t desc)                        \
7047 {                                                                       \
7048     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
7049                 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7050 }                                                                       \
7051 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
7052     (CPUARMState *env, void *vd, void *vg,                              \
7053      void *vm, target_ulong base, uint32_t desc)                        \
7054 {                                                                       \
7055     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
7056                     off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7057 }
7058 
7059 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
7060 void HELPER(sve_ldff##MEM##_##OFS)                                      \
7061     (CPUARMState *env, void *vd, void *vg,                              \
7062      void *vm, target_ulong base, uint32_t desc)                        \
7063 {                                                                       \
7064     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
7065                 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7066 }                                                                       \
7067 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
7068     (CPUARMState *env, void *vd, void *vg,                              \
7069      void *vm, target_ulong base, uint32_t desc)                        \
7070 {                                                                       \
7071     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
7072                     off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7073 }
7074 
7075 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
7076 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
7077 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
7078 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
7079 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
7080 
7081 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
7082 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
7083 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
7084 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
7085 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
7086 
7087 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
7088 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
7089 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
7090 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
7091 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
7092 
7093 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
7094 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
7095 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
7096 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
7097 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
7098 
7099 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
7100 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
7101 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
7102 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
7103 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
7104 
7105 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
7106 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
7107 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
7108 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
7109 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
7110 
7111 DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
7112 DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
7113 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
7114 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
7115 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
7116 
7117 DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
7118 DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
7119 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
7120 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
7121 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
7122 
7123 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
7124 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
7125 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
7126 
7127 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
7128 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
7129 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
7130 
7131 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
7132 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
7133 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
7134 
7135 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
7136 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
7137 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
7138 
7139 /* Stores with a vector index.  */
7140 
7141 static inline QEMU_ALWAYS_INLINE
7142 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7143                target_ulong base, uint32_t desc, uintptr_t retaddr,
7144                uint32_t mtedesc, int esize, int msize,
7145                zreg_off_fn *off_fn,
7146                sve_ldst1_host_fn *host_fn,
7147                sve_ldst1_tlb_fn *tlb_fn)
7148 {
7149     const int mmu_idx = arm_env_mmu_index(env);
7150     const intptr_t reg_max = simd_oprsz(desc);
7151     const int scale = simd_data(desc);
7152     void *host[ARM_MAX_VQ * 4];
7153     intptr_t reg_off, i;
7154     SVEHostPage info, info2;
7155 
7156     /*
7157      * Probe all of the elements for host addresses and flags.
7158      */
7159     i = reg_off = 0;
7160     do {
7161         uint64_t pg = vg[reg_off >> 6];
7162         do {
7163             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7164             target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7165 
7166             host[i] = NULL;
7167             if (likely((pg >> (reg_off & 63)) & 1)) {
7168                 if (likely(in_page >= msize)) {
7169                     sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
7170                                    mmu_idx, retaddr);
7171                     if (!(info.flags & TLB_MMIO)) {
7172                         host[i] = info.host;
7173                     }
7174                 } else {
7175                     /*
7176                      * Element crosses the page boundary.
7177                      * Probe both pages, but do not record the host address,
7178                      * so that we use the slow path.
7179                      */
7180                     sve_probe_page(&info, false, env, addr, 0,
7181                                    MMU_DATA_STORE, mmu_idx, retaddr);
7182                     sve_probe_page(&info2, false, env, addr + in_page, 0,
7183                                    MMU_DATA_STORE, mmu_idx, retaddr);
7184                     info.flags |= info2.flags;
7185                 }
7186 
7187                 if (unlikely(info.flags & TLB_WATCHPOINT)) {
7188                     cpu_check_watchpoint(env_cpu(env), addr, msize,
7189                                          info.attrs, BP_MEM_WRITE, retaddr);
7190                 }
7191 
7192                 if (mtedesc && info.tagged) {
7193                     mte_check(env, mtedesc, addr, retaddr);
7194                 }
7195             }
7196             i += 1;
7197             reg_off += esize;
7198         } while (reg_off & 63);
7199     } while (reg_off < reg_max);
7200 
7201     /*
7202      * Now that we have recognized all exceptions except SyncExternal
7203      * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7204      *
7205      * Note for the common case of an element in RAM, not crossing a page
7206      * boundary, we have stored the host address in host[].  This doubles
7207      * as a first-level check against the predicate, since only enabled
7208      * elements have non-null host addresses.
7209      */
7210     i = reg_off = 0;
7211     do {
7212         void *h = host[i];
7213         if (likely(h != NULL)) {
7214             set_helper_retaddr(retaddr);
7215             host_fn(vd, reg_off, h);
7216             clear_helper_retaddr();
7217         } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7218             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7219             tlb_fn(env, vd, reg_off, addr, retaddr);
7220         }
7221         i += 1;
7222         reg_off += esize;
7223     } while (reg_off < reg_max);
7224 }
7225 
7226 static inline QEMU_ALWAYS_INLINE
7227 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7228                    target_ulong base, uint32_t desc, uintptr_t retaddr,
7229                    int esize, int msize, zreg_off_fn *off_fn,
7230                    sve_ldst1_host_fn *host_fn,
7231                    sve_ldst1_tlb_fn *tlb_fn)
7232 {
7233     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7234     /* Remove mtedesc from the normal sve descriptor. */
7235     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7236 
7237     /*
7238      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7239      * move base entirely across the address space hole, so it cannot
7240      * change the pointer tag or the bit55 selector.  We could therefore
7241      * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
7242      */
7243     sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7244               esize, msize, off_fn, host_fn, tlb_fn);
7245 }
7246 
7247 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
7248 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7249                                  void *vm, target_ulong base, uint32_t desc) \
7250 {                                                                       \
7251     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
7252               off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7253 }                                                                       \
7254 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7255     void *vm, target_ulong base, uint32_t desc)                         \
7256 {                                                                       \
7257     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
7258                   off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7259 }
7260 
7261 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
7262 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7263                                  void *vm, target_ulong base, uint32_t desc) \
7264 {                                                                       \
7265     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
7266               off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7267 }                                                                       \
7268 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7269     void *vm, target_ulong base, uint32_t desc)                         \
7270 {                                                                       \
7271     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
7272                   off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7273 }
7274 
7275 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7276 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7277 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7278 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7279 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7280 
7281 DO_ST1_ZPZ_S(bs, zss, MO_8)
7282 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7283 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7284 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7285 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7286 
7287 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7288 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7289 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7290 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7291 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7292 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7293 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7294 
7295 DO_ST1_ZPZ_D(bd, zss, MO_8)
7296 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7297 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7298 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7299 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7300 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7301 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7302 
7303 DO_ST1_ZPZ_D(bd, zd, MO_8)
7304 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7305 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7306 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7307 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7308 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7309 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7310 
7311 #undef DO_ST1_ZPZ_S
7312 #undef DO_ST1_ZPZ_D
7313 
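/*
 * SVE2 bitwise ternary operations, working on whole 64-bit chunks with
 * no governing predicate: EOR3 (three-way exclusive OR), BCAX (bit
 * clear and exclusive OR), BSL1N/BSL2N (bitwise select with the
 * first/second operand inverted) and NBSL (inverted bitwise select).
 */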
7314 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7315 {
7316     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7317     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7318 
7319     for (i = 0; i < opr_sz; ++i) {
7320         d[i] = n[i] ^ m[i] ^ k[i];
7321     }
7322 }
7323 
7324 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7325 {
7326     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7327     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7328 
7329     for (i = 0; i < opr_sz; ++i) {
7330         d[i] = n[i] ^ (m[i] & ~k[i]);
7331     }
7332 }
7333 
7334 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7335 {
7336     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7337     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7338 
7339     for (i = 0; i < opr_sz; ++i) {
7340         d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7341     }
7342 }
7343 
7344 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7345 {
7346     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7347     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7348 
7349     for (i = 0; i < opr_sz; ++i) {
7350         d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7351     }
7352 }
7353 
7354 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7355 {
7356     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7357     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7358 
7359     for (i = 0; i < opr_sz; ++i) {
7360         d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7361     }
7362 }
7363 
7364 /*
7365  * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7366  * See hasless(v,1) from
7367  *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7368  */
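/*
 * Worked per-byte example for esz == MO_8 (ones == 0x01...01,
 * signs == 0x80...80): a byte b of cmp0/cmp1 is zero exactly when that
 * byte of m matched n, and ignoring inter-byte borrows (b - 1) & ~b has
 * its top bit set only for b == 0.  Borrows in the full 64-bit
 * subtraction originate only at matching (zero) bytes, so although
 * lanes above a match may also end up set, the boolean "any match"
 * answer is exact, which is all the callers need.
 */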
7369 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7370 {
7371     int bits = 8 << esz;
7372     uint64_t ones = dup_const(esz, 1);
7373     uint64_t signs = ones << (bits - 1);
7374     uint64_t cmp0, cmp1;
7375 
7376     cmp1 = dup_const(esz, n);
7377     cmp0 = cmp1 ^ m0;
7378     cmp1 = cmp1 ^ m1;
7379     cmp0 = (cmp0 - ones) & ~cmp0;
7380     cmp1 = (cmp1 - ones) & ~cmp1;
7381     return (cmp0 | cmp1) & signs;
7382 }
7383 
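/*
 * Common helper for MATCH and NMATCH: within each 16-byte segment, the
 * destination predicate bit for an active element of Zn is set when
 * that element occurs anywhere in the corresponding segment of Zm
 * (inverted for NMATCH), and the NZCV flags are computed as for PTEST.
 */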
7384 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7385                                 uint32_t desc, int esz, bool nmatch)
7386 {
7387     uint16_t esz_mask = pred_esz_masks[esz];
7388     intptr_t opr_sz = simd_oprsz(desc);
7389     uint32_t flags = PREDTEST_INIT;
7390     intptr_t i, j, k;
7391 
7392     for (i = 0; i < opr_sz; i += 16) {
7393         uint64_t m0 = *(uint64_t *)(vm + i);
7394         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7395         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7396         uint16_t out = 0;
7397 
7398         for (j = 0; j < 16; j += 8) {
7399             uint64_t n = *(uint64_t *)(vn + i + j);
7400 
7401             for (k = 0; k < 8; k += 1 << esz) {
7402                 if (pg & (1 << (j + k))) {
7403                     bool o = do_match2(n >> (k * 8), m0, m1, esz);
7404                     out |= (o ^ nmatch) << (j + k);
7405                 }
7406             }
7407         }
7408         *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7409         flags = iter_predtest_fwd(out, pg, flags);
7410     }
7411     return flags;
7412 }
7413 
7414 #define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
7415 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
7416 {                                                                             \
7417     return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
7418 }
7419 
7420 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7421 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7422 
7423 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7424 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7425 
7426 #undef DO_PPZZ_MATCH
7427 
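/*
 * HISTCNT: for each active element i, D[i] receives the number of
 * active elements j <= i for which M[j] equals N[i]; elements whose
 * governing predicate bit is inactive are zeroed.
 */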
7428 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7429                             uint32_t desc)
7430 {
7431     ARMVectorReg scratch;
7432     intptr_t i, j;
7433     intptr_t opr_sz = simd_oprsz(desc);
7434     uint32_t *d = vd, *n = vn, *m = vm;
7435     uint8_t *pg = vg;
7436 
7437     if (d == n) {
7438         n = memcpy(&scratch, n, opr_sz);
7439         if (d == m) {
7440             m = n;
7441         }
7442     } else if (d == m) {
7443         m = memcpy(&scratch, m, opr_sz);
7444     }
7445 
7446     for (i = 0; i < opr_sz; i += 4) {
7447         uint64_t count = 0;
7448         uint8_t pred;
7449 
7450         pred = pg[H1(i >> 3)] >> (i & 7);
7451         if (pred & 1) {
7452             uint32_t nn = n[H4(i >> 2)];
7453 
7454             for (j = 0; j <= i; j += 4) {
7455                 pred = pg[H1(j >> 3)] >> (j & 7);
7456                 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7457                     ++count;
7458                 }
7459             }
7460         }
7461         d[H4(i >> 2)] = count;
7462     }
7463 }
7464 
7465 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7466                             uint32_t desc)
7467 {
7468     ARMVectorReg scratch;
7469     intptr_t i, j;
7470     intptr_t opr_sz = simd_oprsz(desc);
7471     uint64_t *d = vd, *n = vn, *m = vm;
7472     uint8_t *pg = vg;
7473 
7474     if (d == n) {
7475         n = memcpy(&scratch, n, opr_sz);
7476         if (d == m) {
7477             m = n;
7478         }
7479     } else if (d == m) {
7480         m = memcpy(&scratch, m, opr_sz);
7481     }
7482 
7483     for (i = 0; i < opr_sz / 8; ++i) {
7484         uint64_t count = 0;
7485         if (pg[H1(i)] & 1) {
7486             uint64_t nn = n[i];
7487             for (j = 0; j <= i; ++j) {
7488                 if ((pg[H1(j)] & 1) && nn == m[j]) {
7489                     ++count;
7490                 }
7491             }
7492         }
7493         d[i] = count;
7494     }
7495 }
7496 
7497 /*
7498  * Returns the number of bytes in m0 and m1 that match n.
7499  * Unlike do_match2 we don't just need true/false, we need an exact count.
7500  * This requires two extra logical operations.
7501  */
7502 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7503 {
7504     const uint64_t mask = dup_const(MO_8, 0x7f);
7505     uint64_t cmp0, cmp1;
7506 
7507     cmp1 = dup_const(MO_8, n);
7508     cmp0 = cmp1 ^ m0;
7509     cmp1 = cmp1 ^ m1;
7510 
7511     /*
7512      * 1: clear msb of each byte to avoid carry to next byte (& mask)
7513      * 2: carry in to msb if byte != 0 (+ mask)
7514      * 3: set msb if cmp has msb set (| cmp)
7515      * 4: set ~msb to ignore them (| mask)
7516      * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7517      * 5: invert, resulting in 0x80 if and only if byte == 0.
7518      */
7519     cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7520     cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
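    /*
     * Worked example for one byte b of cmp: b == 0x00 yields
     * ((0x00 & 0x7f) + 0x7f) | 0x00 | 0x7f == 0x7f, inverted to 0x80,
     * while any b != 0 yields 0xff, inverted to 0x00; each matching
     * byte therefore contributes exactly one set bit.
     */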
7521 
7522     /*
7523      * Combine the two compares in a way that the bits do
7524      * not overlap, and so preserves the count of set bits.
7525      * If the host has an efficient instruction for ctpop,
7526      * then ctpop(x) + ctpop(y) has the same number of
7527      * operations as ctpop(x | (y >> 1)).  If the host does
7528      * not have an efficient ctpop, then we only want to
7529      * use it once.
7530      */
7531     return ctpop64(cmp0 | (cmp1 >> 1));
7532 }
7533 
7534 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7535 {
7536     intptr_t i, j;
7537     intptr_t opr_sz = simd_oprsz(desc);
7538 
7539     for (i = 0; i < opr_sz; i += 16) {
7540         uint64_t n0 = *(uint64_t *)(vn + i);
7541         uint64_t m0 = *(uint64_t *)(vm + i);
7542         uint64_t n1 = *(uint64_t *)(vn + i + 8);
7543         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7544         uint64_t out0 = 0;
7545         uint64_t out1 = 0;
7546 
7547         for (j = 0; j < 64; j += 8) {
7548             uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7549             uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7550             out0 |= cnt0 << j;
7551             out1 |= cnt1 << j;
7552         }
7553 
7554         *(uint64_t *)(vd + i) = out0;
7555         *(uint64_t *)(vd + i + 8) = out1;
7556     }
7557 }
7558 
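/*
 * XAR: exclusive OR, then rotate right by an immediate.  The byte and
 * halfword helpers rotate every lane within each 64-bit chunk at once
 * using a pair of masked shifts; the word helper rotates each 32-bit
 * element with ror32.
 */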
7559 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7560 {
7561     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7562     int shr = simd_data(desc);
7563     int shl = 8 - shr;
7564     uint64_t mask = dup_const(MO_8, 0xff >> shr);
7565     uint64_t *d = vd, *n = vn, *m = vm;
7566 
7567     for (i = 0; i < opr_sz; ++i) {
7568         uint64_t t = n[i] ^ m[i];
7569         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7570     }
7571 }
7572 
7573 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7574 {
7575     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7576     int shr = simd_data(desc);
7577     int shl = 16 - shr;
7578     uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7579     uint64_t *d = vd, *n = vn, *m = vm;
7580 
7581     for (i = 0; i < opr_sz; ++i) {
7582         uint64_t t = n[i] ^ m[i];
7583         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7584     }
7585 }
7586 
7587 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7588 {
7589     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7590     int shr = simd_data(desc);
7591     uint32_t *d = vd, *n = vn, *m = vm;
7592 
7593     for (i = 0; i < opr_sz; ++i) {
7594         d[i] = ror32(n[i] ^ m[i], shr);
7595     }
7596 }
7597 
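/*
 * FMMLA: each 128-bit (float32) or 256-bit (float64) segment of the
 * operands is treated as a 2x2 matrix stored row-major, and the
 * destination accumulates N * M^T, i.e.
 *   D[i][j] = A[i][j] + N[i][0] * M[j][0] + N[i][1] * M[j][1],
 * with softfloat rounding applied to every multiply and add.
 */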
7598 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7599                      float_status *status, uint32_t desc)
7600 {
7601     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7602 
7603     for (s = 0; s < opr_sz; ++s) {
7604         float32 *n = vn + s * sizeof(float32) * 4;
7605         float32 *m = vm + s * sizeof(float32) * 4;
7606         float32 *a = va + s * sizeof(float32) * 4;
7607         float32 *d = vd + s * sizeof(float32) * 4;
7608         float32 n00 = n[H4(0)], n01 = n[H4(1)];
7609         float32 n10 = n[H4(2)], n11 = n[H4(3)];
7610         float32 m00 = m[H4(0)], m01 = m[H4(1)];
7611         float32 m10 = m[H4(2)], m11 = m[H4(3)];
7612         float32 p0, p1;
7613 
7614         /* i = 0, j = 0 */
7615         p0 = float32_mul(n00, m00, status);
7616         p1 = float32_mul(n01, m01, status);
7617         d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7618 
7619         /* i = 0, j = 1 */
7620         p0 = float32_mul(n00, m10, status);
7621         p1 = float32_mul(n01, m11, status);
7622         d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7623 
7624         /* i = 1, j = 0 */
7625         p0 = float32_mul(n10, m00, status);
7626         p1 = float32_mul(n11, m01, status);
7627         d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7628 
7629         /* i = 1, j = 1 */
7630         p0 = float32_mul(n10, m10, status);
7631         p1 = float32_mul(n11, m11, status);
7632         d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7633     }
7634 }
7635 
7636 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7637                      float_status *status, uint32_t desc)
7638 {
7639     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7640 
7641     for (s = 0; s < opr_sz; ++s) {
7642         float64 *n = vn + s * sizeof(float64) * 4;
7643         float64 *m = vm + s * sizeof(float64) * 4;
7644         float64 *a = va + s * sizeof(float64) * 4;
7645         float64 *d = vd + s * sizeof(float64) * 4;
7646         float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7647         float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7648         float64 p0, p1;
7649 
7650         /* i = 0, j = 0 */
7651         p0 = float64_mul(n00, m00, status);
7652         p1 = float64_mul(n01, m01, status);
7653         d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7654 
7655         /* i = 0, j = 1 */
7656         p0 = float64_mul(n00, m10, status);
7657         p1 = float64_mul(n01, m11, status);
7658         d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7659 
7660         /* i = 1, j = 0 */
7661         p0 = float64_mul(n10, m00, status);
7662         p1 = float64_mul(n11, m01, status);
7663         d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7664 
7665         /* i = 1, j = 1 */
7666         p0 = float64_mul(n10, m10, status);
7667         p1 = float64_mul(n11, m11, status);
7668         d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7669     }
7670 }
7671 
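/*
 * FCVTNT-style narrowing conversions: each active wide element is
 * converted and written to the odd-numbered (top) narrow element of the
 * corresponding destination pair, leaving the even-numbered (bottom)
 * narrow elements of vd untouched.
 */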
7672 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7673 void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
7674                   float_status *status, uint32_t desc)                        \
7675 {                                                                             \
7676     intptr_t i = simd_oprsz(desc);                                            \
7677     uint64_t *g = vg;                                                         \
7678     do {                                                                      \
7679         uint64_t pg = g[(i - 1) >> 6];                                        \
7680         do {                                                                  \
7681             i -= sizeof(TYPEW);                                               \
7682             if (likely((pg >> (i & 63)) & 1)) {                               \
7683                 TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
7684                 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
7685             }                                                                 \
7686         } while (i & 63);                                                     \
7687     } while (i != 0);                                                         \
7688 }
7689 
7690 DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7691 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7692 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7693 
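/*
 * FCVTLT-style widening conversions: each active wide destination
 * element is produced by converting the odd-numbered (top) narrow
 * source element of the corresponding pair in vn.
 */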
7694 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7695 void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
7696                   float_status *status, uint32_t desc)                        \
7697 {                                                                             \
7698     intptr_t i = simd_oprsz(desc);                                            \
7699     uint64_t *g = vg;                                                         \
7700     do {                                                                      \
7701         uint64_t pg = g[(i - 1) >> 6];                                        \
7702         do {                                                                  \
7703             i -= sizeof(TYPEW);                                               \
7704             if (likely((pg >> (i & 63)) & 1)) {                               \
7705                 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
7706                 *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
7707             }                                                                 \
7708         } while (i & 63);                                                     \
7709     } while (i != 0);                                                         \
7710 }
7711 
7712 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7713 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7714 
7715 #undef DO_FCVTLT
7716 #undef DO_FCVTNT
7717