1 /*
2  * ARM SVE Operations
3  *
4  * Copyright (c) 2018 Linaro, Ltd.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/page-protection.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg.h"
29 #include "vec_internal.h"
30 #include "sve_ldst_internal.h"
31 #include "hw/core/tcg-cpu-ops.h"
32 #ifdef CONFIG_USER_ONLY
33 #include "user/page-protection.h"
34 #endif
35 
36 
37 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
38  *
39  * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
40  * and bit 0 set if C is set.  Compare the definitions of these variables
41  * within CPUARMState.
42  */
43 
44 /* For no G bits set, NZCV = C.  */
45 #define PREDTEST_INIT  1
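/*
 * For example, a return value of 0x80000003 encodes N=1, Z=0, C=1 under
 * this scheme (bit 31 for N, bit 1 set meaning Z clear, bit 0 for C).
 * PREDTEST_INIT == 1 therefore encodes N=0, Z=1, C=1, the PredTest
 * result when the governing predicate has no bits set.
 */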
46 
47 /* This is an iterative function, called for each Pd and Pg word
48  * moving forward.
49  */
50 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
51 {
52     if (likely(g)) {
53         /* Compute N from first D & G.
54            Use bit 2 to signal first G bit seen.  */
55         if (!(flags & 4)) {
56             flags |= ((d & (g & -g)) != 0) << 31;
57             flags |= 4;
58         }
59 
60         /* Accumulate Z from each D & G.  */
61         flags |= ((d & g) != 0) << 1;
62 
63         /* Compute C from last !(D & G).  Replace previous.  */
64         flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
65     }
66     return flags;
67 }
68 
69 /* This is an iterative function, called for each Pd and Pg word
70  * moving backward.
71  */
72 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
73 {
74     if (likely(g)) {
75         /* Compute C from first (i.e last) !(D & G).
76            Use bit 2 to signal first G bit seen.  */
77         if (!(flags & 4)) {
78             flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
79             flags |= (d & pow2floor(g)) == 0;
80         }
81 
82         /* Accumulate Z from each D & G.  */
83         flags |= ((d & g) != 0) << 1;
84 
85         /* Compute N from last (i.e first) D & G.  Replace previous.  */
86         flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
87     }
88     return flags;
89 }
90 
91 /* The same for a single word predicate.  */
92 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
93 {
94     return iter_predtest_fwd(d, g, PREDTEST_INIT);
95 }
96 
97 /* The same for a multi-word predicate.  */
98 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
99 {
100     uint32_t flags = PREDTEST_INIT;
101     uint64_t *d = vd, *g = vg;
102     uintptr_t i = 0;
103 
104     do {
105         flags = iter_predtest_fwd(d[i], g[i], flags);
106     } while (++i < words);
107 
108     return flags;
109 }
110 
111 /* Similarly for single word elements.  */
112 static inline uint64_t expand_pred_s(uint8_t byte)
113 {
114     static const uint64_t word[] = {
115         [0x01] = 0x00000000ffffffffull,
116         [0x10] = 0xffffffff00000000ull,
117         [0x11] = 0xffffffffffffffffull,
118     };
119     return word[byte & 0x11];
120 }
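/*
 * For example, a predicate byte of 0x10 keeps only bit 4, which governs
 * the upper 32-bit element of the 64-bit chunk, so it expands to
 * 0xffffffff00000000; 0x11 activates both elements and expands to all
 * ones.  The "& 0x11" masks off the predicate bits that are not
 * significant for word-sized elements.
 */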
121 
122 #define LOGICAL_PPPP(NAME, FUNC) \
123 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
124 {                                                                         \
125     uintptr_t opr_sz = simd_oprsz(desc);                                  \
126     uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
127     uintptr_t i;                                                          \
128     for (i = 0; i < opr_sz / 8; ++i) {                                    \
129         d[i] = FUNC(n[i], m[i], g[i]);                                    \
130     }                                                                     \
131 }
132 
133 #define DO_AND(N, M, G)  (((N) & (M)) & (G))
134 #define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
135 #define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
136 #define DO_ORR(N, M, G)  (((N) | (M)) & (G))
137 #define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
138 #define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
139 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
140 #define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))
141 
142 LOGICAL_PPPP(sve_and_pppp, DO_AND)
143 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
144 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
145 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
146 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
147 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
148 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
149 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
150 
151 #undef DO_AND
152 #undef DO_BIC
153 #undef DO_EOR
154 #undef DO_ORR
155 #undef DO_ORN
156 #undef DO_NOR
157 #undef DO_NAND
158 #undef DO_SEL
159 #undef LOGICAL_PPPP
160 
161 /* Fully general three-operand expander, controlled by a predicate.
162  * This is complicated by the host-endian storage of the register file.
163  */
164 /* ??? I don't expect the compiler could ever vectorize this itself.
165  * With some tables we can convert bit masks to byte masks, and with
166  * extra care wrt byte/word ordering we could use gcc generic vectors
167  * and do 16 bytes at a time.
168  */
169 #define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
170 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
171 {                                                                       \
172     intptr_t i, opr_sz = simd_oprsz(desc);                              \
173     for (i = 0; i < opr_sz; ) {                                         \
174         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
175         do {                                                            \
176             if (pg & 1) {                                               \
177                 TYPE nn = *(TYPE *)(vn + H(i));                         \
178                 TYPE mm = *(TYPE *)(vm + H(i));                         \
179                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
180             }                                                           \
181             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
182         } while (i & 15);                                               \
183     }                                                                   \
184 }
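/*
 * A note on the loop structure: SVE predicates hold one bit per byte of
 * the vector, and only the lowest-numbered bit of each element's group
 * is significant.  The expander therefore loads 16 predicate bits per
 * 16-byte chunk of data, advances "pg" by sizeof(TYPE) bits for each
 * element, and tests only the low bit before operating on that element.
 */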
185 
186 /* Similarly, specialized for 64-bit operands.  */
187 #define DO_ZPZZ_D(NAME, TYPE, OP)                                \
188 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
189 {                                                               \
190     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
191     TYPE *d = vd, *n = vn, *m = vm;                             \
192     uint8_t *pg = vg;                                           \
193     for (i = 0; i < opr_sz; i += 1) {                           \
194         if (pg[H1(i)] & 1) {                                    \
195             TYPE nn = n[i], mm = m[i];                          \
196             d[i] = OP(nn, mm);                                  \
197         }                                                       \
198     }                                                           \
199 }
200 
201 #define DO_AND(N, M)  (N & M)
202 #define DO_EOR(N, M)  (N ^ M)
203 #define DO_ORR(N, M)  (N | M)
204 #define DO_BIC(N, M)  (N & ~M)
205 #define DO_ADD(N, M)  (N + M)
206 #define DO_SUB(N, M)  (N - M)
207 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
208 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
209 #define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
210 #define DO_MUL(N, M)  (N * M)
211 
212 
213 /*
214  * We must avoid the C undefined behaviour cases: division by
215  * zero and signed division of INT_MIN by -1. Both of these
216  * have architecturally defined required results for Arm.
217  * We special case all signed divisions by -1 to avoid having
218  * to deduce the minimum integer for the type involved.
219  */
220 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
221 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
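/*
 * For example, with 32-bit elements, INT32_MIN / -1 yields INT32_MIN
 * (the negation wraps) and any division by zero yields 0, which are the
 * architecturally required results for SDIV and UDIV noted above.
 */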
222 
223 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
224 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
225 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
226 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
227 
228 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
229 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
230 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
231 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
232 
233 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
234 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
235 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
236 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
237 
238 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
239 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
240 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
241 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
242 
243 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
244 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
245 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
246 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
247 
248 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
249 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
250 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
251 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
252 
253 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
254 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
255 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
256 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
257 
258 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
259 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
260 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
261 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
262 
263 DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
264 DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
265 DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
266 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)
267 
268 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
269 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
270 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
271 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
272 
273 DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
274 DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
275 DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
276 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)
277 
278 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
279 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
280 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
281 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
282 
283 /* Because the computation type is at least twice as large as required,
284    these work for both signed and unsigned source types.  */
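/*
 * For instance, do_mulh_b serves both sve_smulh_zpzz_b and
 * sve_umulh_zpzz_b below: the 8-bit inputs arrive sign- or zero-extended
 * to int32_t, the 16-bit product is computed exactly, and the shift
 * extracts the correct high half for either signedness.
 */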
285 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
286 {
287     return (n * m) >> 8;
288 }
289 
290 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
291 {
292     return (n * m) >> 16;
293 }
294 
295 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
296 {
297     return (n * m) >> 32;
298 }
299 
300 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
301 {
302     uint64_t lo, hi;
303     muls64(&lo, &hi, n, m);
304     return hi;
305 }
306 
307 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
308 {
309     uint64_t lo, hi;
310     mulu64(&lo, &hi, n, m);
311     return hi;
312 }
313 
314 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
315 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
316 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
317 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
318 
319 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
320 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
321 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
322 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
323 
324 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
325 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
326 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
327 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
328 
329 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
330 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
331 
332 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
333 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
334 
335 /* Note that all bits of the shift are significant
336    and not modulo the element size.  */
337 #define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
338 #define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
339 #define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
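/*
 * For example, shifting an 8-bit element by 9: DO_ASR clamps the count
 * to 7 and fills the element with the sign bit, while DO_LSR and DO_LSL
 * produce 0, as required when the shift is not taken modulo the element
 * size.
 */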
340 
341 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
342 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
343 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
344 
345 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
346 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
347 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
348 
349 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
350 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
351 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
352 
353 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
354 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
355 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
356 
357 static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
358 {
359     int8_t n1 = n, n2 = n >> 8;
360     return m + n1 + n2;
361 }
362 
363 static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
364 {
365     int16_t n1 = n, n2 = n >> 16;
366     return m + n1 + n2;
367 }
368 
369 static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
370 {
371     int32_t n1 = n, n2 = n >> 32;
372     return m + n1 + n2;
373 }
374 
375 DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
376 DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
377 DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
378 
379 static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
380 {
381     uint8_t n1 = n, n2 = n >> 8;
382     return m + n1 + n2;
383 }
384 
385 static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
386 {
387     uint16_t n1 = n, n2 = n >> 16;
388     return m + n1 + n2;
389 }
390 
391 static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
392 {
393     uint32_t n1 = n, n2 = n >> 32;
394     return m + n1 + n2;
395 }
396 
397 DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
398 DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
399 DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
400 
401 #define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
402 #define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
403 #define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
404 #define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)
405 
406 DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
407 DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
408 DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
409 DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
410 
411 #define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
412 #define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
413 #define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
414 #define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)
415 
416 DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
417 DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
418 DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
419 DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
420 
421 /*
422  * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
423  * We pass in a pointer to a dummy saturation field to trigger
424  * the saturating arithmetic but discard the information about
425  * whether it has occurred.
426  */
427 #define do_sqshl_b(n, m) \
428    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
429 #define do_sqshl_h(n, m) \
430    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
431 #define do_sqshl_s(n, m) \
432    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
433 #define do_sqshl_d(n, m) \
434    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
435 
436 DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1, do_sqshl_b)
437 DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
438 DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
439 DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
440 
441 #define do_uqshl_b(n, m) \
442    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
443 #define do_uqshl_h(n, m) \
444    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
445 #define do_uqshl_s(n, m) \
446    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
447 #define do_uqshl_d(n, m) \
448    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
449 
450 DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1, do_uqshl_b)
451 DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
452 DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
453 DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
454 
455 #define do_sqrshl_b(n, m) \
456    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
457 #define do_sqrshl_h(n, m) \
458    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
459 #define do_sqrshl_s(n, m) \
460    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
461 #define do_sqrshl_d(n, m) \
462    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
463 
464 DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1, do_sqrshl_b)
465 DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
466 DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
467 DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
468 
469 #undef do_sqrshl_d
470 
471 #define do_uqrshl_b(n, m) \
472    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
473 #define do_uqrshl_h(n, m) \
474    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
475 #define do_uqrshl_s(n, m) \
476    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
477 #define do_uqrshl_d(n, m) \
478    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
479 
480 DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1, do_uqrshl_b)
481 DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
482 DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
483 DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
484 
485 #undef do_uqrshl_d
486 
487 #define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
488 #define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))
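/*
 * The 64-bit form cannot widen to a 128-bit intermediate, so it halves
 * each operand first and re-adds the carry that the two low bits would
 * have generated: (n >> 1) + (m >> 1) + (n & m & 1) equals (n + m) >> 1
 * without risk of overflow.
 */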
489 
490 DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
491 DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
492 DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
493 DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
494 
495 DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
496 DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
497 DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
498 DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
499 
500 #define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
501 #define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))
502 
503 DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
504 DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
505 DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
506 DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
507 
508 DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
509 DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
510 DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
511 DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
512 
513 #define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
514 #define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))
515 
516 DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
517 DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
518 DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
519 DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
520 
521 DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
522 DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
523 DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
524 DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
525 
526 static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
527 {
528     return val >= max ? max : val <= min ? min : val;
529 }
530 
531 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
532 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
533 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
534 
535 static inline int64_t do_sqadd_d(int64_t n, int64_t m)
536 {
537     int64_t r = n + m;
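    /*
     * Overflow can only occur when the operands have the same sign and
     * the result's sign differs from them: (r ^ n) is negative when the
     * result and n disagree in sign, and ~(n ^ m) is negative when the
     * operands agree, so the AND of the two is negative exactly on
     * overflow.
     */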
538     if (((r ^ n) & ~(n ^ m)) < 0) {
539         /* Signed overflow.  */
540         return r < 0 ? INT64_MAX : INT64_MIN;
541     }
542     return r;
543 }
544 
545 DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
546 DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
547 DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
548 DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
549 
550 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
551 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
552 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
553 
554 static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
555 {
556     uint64_t r = n + m;
557     return r < n ? UINT64_MAX : r;
558 }
559 
560 DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
561 DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
562 DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
563 DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
564 
565 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
566 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
567 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
568 
569 static inline int64_t do_sqsub_d(int64_t n, int64_t m)
570 {
571     int64_t r = n - m;
572     if (((r ^ n) & (n ^ m)) < 0) {
573         /* Signed overflow.  */
574         return r < 0 ? INT64_MAX : INT64_MIN;
575     }
576     return r;
577 }
578 
579 DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
580 DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
581 DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
582 DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
583 
584 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
585 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
586 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
587 
588 static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
589 {
590     return n > m ? n - m : 0;
591 }
592 
593 DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
594 DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
595 DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
596 DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
597 
598 #define DO_SUQADD_B(n, m) \
599     do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
600 #define DO_SUQADD_H(n, m) \
601     do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
602 #define DO_SUQADD_S(n, m) \
603     do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
604 
605 static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
606 {
607     uint64_t r = n + m;
608 
609     if (n < 0) {
610         /* Note that m - abs(n) cannot underflow. */
611         if (r > INT64_MAX) {
612             /* Result is either very large positive or negative. */
613             if (m > -n) {
614                 /* m > abs(n), so r is a very large positive. */
615                 return INT64_MAX;
616             }
617             /* Result is negative. */
618         }
619     } else {
620         /* Both inputs are positive: check for overflow.  */
621         if (r < m || r > INT64_MAX) {
622             return INT64_MAX;
623         }
624     }
625     return r;
626 }
627 
628 DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
629 DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
630 DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
631 DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
632 
633 #define DO_USQADD_B(n, m) \
634     do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
635 #define DO_USQADD_H(n, m) \
636     do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
637 #define DO_USQADD_S(n, m) \
638     do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
639 
640 static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
641 {
642     uint64_t r = n + m;
643 
644     if (m < 0) {
645         return n < -m ? 0 : r;
646     }
647     return r < n ? UINT64_MAX : r;
648 }
649 
650 DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
651 DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
652 DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
653 DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
654 
655 #undef DO_ZPZZ
656 #undef DO_ZPZZ_D
657 
658 /*
659  * Three operand expander, operating on element pairs.
660  * If the slot I is even, the elements are from VN {I, I+1}.
661  * If the slot I is odd, the elements are from VM {I-1, I}.
662  * Load all of the input elements in each pair before overwriting output.
663  */
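/*
 * For example, for ADDP with byte elements the first two results are
 * d[0] = n[0] + n[1] and d[1] = m[0] + m[1], each written only when the
 * predicate bit for that destination slot is set.
 */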
664 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
665 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
666 {                                                               \
667     intptr_t i, opr_sz = simd_oprsz(desc);                      \
668     for (i = 0; i < opr_sz; ) {                                 \
669         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
670         do {                                                    \
671             TYPE n0 = *(TYPE *)(vn + H(i));                     \
672             TYPE m0 = *(TYPE *)(vm + H(i));                     \
673             TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
674             TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
675             if (pg & 1) {                                       \
676                 *(TYPE *)(vd + H(i)) = OP(n0, n1);              \
677             }                                                   \
678             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
679             if (pg & 1) {                                       \
680                 *(TYPE *)(vd + H(i)) = OP(m0, m1);              \
681             }                                                   \
682             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
683         } while (i & 15);                                       \
684     }                                                           \
685 }
686 
687 /* Similarly, specialized for 64-bit operands.  */
688 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
689 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
690 {                                                               \
691     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
692     TYPE *d = vd, *n = vn, *m = vm;                             \
693     uint8_t *pg = vg;                                           \
694     for (i = 0; i < opr_sz; i += 2) {                           \
695         TYPE n0 = n[i], n1 = n[i + 1];                          \
696         TYPE m0 = m[i], m1 = m[i + 1];                          \
697         if (pg[H1(i)] & 1) {                                    \
698             d[i] = OP(n0, n1);                                  \
699         }                                                       \
700         if (pg[H1(i + 1)] & 1) {                                \
701             d[i + 1] = OP(m0, m1);                              \
702         }                                                       \
703     }                                                           \
704 }
705 
706 DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
707 DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
708 DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
709 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
710 
711 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
712 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
713 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
714 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
715 
716 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
717 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
718 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
719 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
720 
721 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
722 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
723 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
724 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
725 
726 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
727 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
728 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
729 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
730 
731 #undef DO_ZPZZ_PAIR
732 #undef DO_ZPZZ_PAIR_D
733 
734 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                              \
735 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
736                   float_status *status, uint32_t desc)                  \
737 {                                                                       \
738     intptr_t i, opr_sz = simd_oprsz(desc);                              \
739     for (i = 0; i < opr_sz; ) {                                         \
740         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
741         do {                                                            \
742             TYPE n0 = *(TYPE *)(vn + H(i));                             \
743             TYPE m0 = *(TYPE *)(vm + H(i));                             \
744             TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));              \
745             TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));              \
746             if (pg & 1) {                                               \
747                 *(TYPE *)(vd + H(i)) = OP(n0, n1, status);              \
748             }                                                           \
749             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
750             if (pg & 1) {                                               \
751                 *(TYPE *)(vd + H(i)) = OP(m0, m1, status);              \
752             }                                                           \
753             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
754         } while (i & 15);                                               \
755     }                                                                   \
756 }
757 
758 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
759 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
760 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
761 
762 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
763 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
764 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
765 
766 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
767 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
768 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
769 
770 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
771 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
772 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
773 
774 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
775 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
776 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
777 
778 #undef DO_ZPZZ_PAIR_FP
779 
780 /* Three-operand expander, controlled by a predicate, in which the
781  * third operand is "wide".  That is, for D = N op M, the same 64-bit
782  * value of M is used with all of the narrower values of N.
783  */
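/*
 * Concretely, for sve_lsl_zpzw_b each group of eight byte elements that
 * shares a 64-bit slot of the vector is shifted by the single 64-bit
 * value read from the corresponding slot of M; hence the inner loop
 * bound of "i & 7" below.
 */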
784 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
785 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
786 {                                                                       \
787     intptr_t i, opr_sz = simd_oprsz(desc);                              \
788     for (i = 0; i < opr_sz; ) {                                         \
789         uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
790         TYPEW mm = *(TYPEW *)(vm + i);                                  \
791         do {                                                            \
792             if (pg & 1) {                                               \
793                 TYPE nn = *(TYPE *)(vn + H(i));                         \
794                 *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
795             }                                                           \
796             i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
797         } while (i & 7);                                                \
798     }                                                                   \
799 }
800 
801 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
802 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
803 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
804 
805 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
806 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
807 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
808 
809 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
810 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
811 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
812 
813 #undef DO_ZPZW
814 
815 /* Fully general two-operand expander, controlled by a predicate.
816  */
817 #define DO_ZPZ(NAME, TYPE, H, OP)                               \
818 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
819 {                                                               \
820     intptr_t i, opr_sz = simd_oprsz(desc);                      \
821     for (i = 0; i < opr_sz; ) {                                 \
822         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
823         do {                                                    \
824             if (pg & 1) {                                       \
825                 TYPE nn = *(TYPE *)(vn + H(i));                 \
826                 *(TYPE *)(vd + H(i)) = OP(nn);                  \
827             }                                                   \
828             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
829         } while (i & 15);                                       \
830     }                                                           \
831 }
832 
833 /* Similarly, specialized for 64-bit operands.  */
834 #define DO_ZPZ_D(NAME, TYPE, OP)                                \
835 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
836 {                                                               \
837     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
838     TYPE *d = vd, *n = vn;                                      \
839     uint8_t *pg = vg;                                           \
840     for (i = 0; i < opr_sz; i += 1) {                           \
841         if (pg[H1(i)] & 1) {                                    \
842             TYPE nn = n[i];                                     \
843             d[i] = OP(nn);                                      \
844         }                                                       \
845     }                                                           \
846 }
847 
848 #define DO_CLS_B(N)   (clrsb32(N) - 24)
849 #define DO_CLS_H(N)   (clrsb32(N) - 16)
850 
851 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
852 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
853 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
854 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
855 
856 #define DO_CLZ_B(N)   (clz32(N) - 24)
857 #define DO_CLZ_H(N)   (clz32(N) - 16)
858 
859 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
860 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
861 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
862 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
863 
864 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
865 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
866 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
867 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
868 
869 #define DO_CNOT(N)    (N == 0)
870 
871 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
872 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
873 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
874 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
875 
876 #define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))
877 
878 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
879 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
880 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
881 
882 #define DO_AH_FABS_H(N) (float16_is_any_nan(N) ? (N) : DO_FABS(N))
883 #define DO_AH_FABS_S(N) (float32_is_any_nan(N) ? (N) : DO_FABS(N))
884 #define DO_AH_FABS_D(N) (float64_is_any_nan(N) ? (N) : DO_FABS(N))
885 
886 DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H)
887 DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S)
888 DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D)
889 
890 #define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))
891 
892 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
893 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
894 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
895 
896 #define DO_AH_FNEG_H(N) (float16_is_any_nan(N) ? (N) : DO_FNEG(N))
897 #define DO_AH_FNEG_S(N) (float32_is_any_nan(N) ? (N) : DO_FNEG(N))
898 #define DO_AH_FNEG_D(N) (float64_is_any_nan(N) ? (N) : DO_FNEG(N))
899 
900 DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H)
901 DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S)
902 DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D)
903 
904 #define DO_NOT(N)    (~N)
905 
906 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
907 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
908 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
909 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
910 
911 #define DO_SXTB(N)    ((int8_t)N)
912 #define DO_SXTH(N)    ((int16_t)N)
913 #define DO_SXTS(N)    ((int32_t)N)
914 #define DO_UXTB(N)    ((uint8_t)N)
915 #define DO_UXTH(N)    ((uint16_t)N)
916 #define DO_UXTS(N)    ((uint32_t)N)
917 
918 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
919 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
920 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
921 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
922 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
923 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
924 
925 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
926 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
927 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
928 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
929 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
930 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
931 
932 #define DO_ABS(N)    (N < 0 ? -N : N)
933 
934 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
935 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
936 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
937 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
938 
939 #define DO_NEG(N)    (-N)
940 
941 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
942 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
943 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
944 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
945 
946 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
947 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
948 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
949 
950 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
951 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
952 
953 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
954 
955 void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
956 {
957     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
958     uint64_t *d = vd, *n = vn;
959     uint8_t *pg = vg;
960 
961     for (i = 0; i < opr_sz; i += 2) {
962         if (pg[H1(i)] & 1) {
963             uint64_t n0 = n[i + 0];
964             uint64_t n1 = n[i + 1];
965             d[i + 0] = n1;
966             d[i + 1] = n0;
967         }
968     }
969 }
970 
971 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
972 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
973 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
974 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
975 
976 #define DO_SQABS(X) \
977     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
978        x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
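/* The most negative value has no positive counterpart, so it saturates:
   e.g. SQABS of INT8_MIN yields INT8_MAX.  */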
979 
980 DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
981 DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
982 DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
983 DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
984 
985 #define DO_SQNEG(X) \
986     ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
987        x_ == min_ ? -min_ - 1 : -x_; })
988 
989 DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
990 DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
991 DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
992 DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
993 
994 DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
995 DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
996 
997 /* Three-operand expander, unpredicated, in which the third operand is "wide".
998  */
999 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
1000 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1001 {                                                              \
1002     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1003     for (i = 0; i < opr_sz; ) {                                \
1004         TYPEW mm = *(TYPEW *)(vm + i);                         \
1005         do {                                                   \
1006             TYPE nn = *(TYPE *)(vn + H(i));                    \
1007             *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
1008             i += sizeof(TYPE);                                 \
1009         } while (i & 7);                                       \
1010     }                                                          \
1011 }
1012 
1013 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1014 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1015 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1016 
1017 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1018 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1019 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1020 
1021 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1022 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1023 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1024 
1025 #undef DO_ZZW
1026 
1027 #undef DO_CLS_B
1028 #undef DO_CLS_H
1029 #undef DO_CLZ_B
1030 #undef DO_CLZ_H
1031 #undef DO_CNOT
1032 #undef DO_FABS
1033 #undef DO_FNEG
1034 #undef DO_ABS
1035 #undef DO_NEG
1036 #undef DO_ZPZ
1037 #undef DO_ZPZ_D
1038 
1039 /*
1040  * Three-operand expander, unpredicated, in which the two inputs are
1041  * selected from the top or bottom half of the wide column.
1042  */
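/*
 * "Bottom" selects the even-numbered narrow elements and "top" the odd
 * ones: sel1 and sel2 below are either 0 or sizeof(TYPEN).  For example,
 * SADDLB reads both narrow inputs from the even slots of each wide
 * column, while SADDLT reads them from the odd slots.
 */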
1043 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1044 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1045 {                                                                       \
1046     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1047     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1048     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1049     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1050         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1051         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1052         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
1053     }                                                                   \
1054 }
1055 
1056 DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1057 DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1058 DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1059 
1060 DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1061 DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1062 DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1063 
1064 DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1065 DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1066 DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1067 
1068 DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1069 DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1070 DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1071 
1072 DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1073 DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1074 DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1075 
1076 DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1077 DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1078 DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1079 
1080 DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1081 DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1082 DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1083 
1084 DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1085 DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1086 DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1087 
1088 /* Note that the multiply cannot overflow, but the doubling can. */
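/*
 * For example, in do_sqdmull_h the inputs are sign-extended bytes, so the
 * product has magnitude at most 128 * 128 = 16384 and fits in int16_t,
 * but doubling it via DO_SQADD_H(val, val) can exceed INT16_MAX and then
 * saturates to 32767.
 */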
1089 static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1090 {
1091     int16_t val = n * m;
1092     return DO_SQADD_H(val, val);
1093 }
1094 
1095 static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1096 {
1097     int32_t val = n * m;
1098     return DO_SQADD_S(val, val);
1099 }
1100 
1101 static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1102 {
1103     int64_t val = n * m;
1104     return do_sqadd_d(val, val);
1105 }
1106 
1107 DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1108 DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1109 DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1110 
1111 #undef DO_ZZZ_TB
1112 
1113 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1114 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1115 {                                                              \
1116     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1117     int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1118     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1119         TYPEW nn = *(TYPEW *)(vn + HW(i));                     \
1120         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));              \
1121         *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                   \
1122     }                                                          \
1123 }
1124 
1125 DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1126 DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1127 DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
1128 
1129 DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1130 DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1131 DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
1132 
1133 DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1134 DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1135 DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
1136 
1137 DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1138 DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1139 DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
1140 
1141 #undef DO_ZZZ_WTB
1142 
1143 #define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
1144 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
1145 {                                                                       \
1146     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1147     intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1148     intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1149     for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
1150         TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
1151         TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
1152         *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
1153     }                                                                   \
1154 }
1155 
1156 DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1157 DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1158 DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1159 DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
1160 
1161 #undef DO_ZZZ_NTB
1162 
1163 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1164 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1165 {                                                               \
1166     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1167     intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);            \
1168     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
1169         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
1170         TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));               \
1171         TYPEW aa = *(TYPEW *)(va + HW(i));                      \
1172         *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;               \
1173     }                                                           \
1174 }
1175 
1176 DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1177 DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1178 DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
1179 
1180 DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1181 DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1182 DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
1183 
1184 DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1185 DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1186 DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1187 
1188 DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1189 DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1190 DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1191 
1192 #define DO_NMUL(N, M)  -(N * M)
1193 
1194 DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1195 DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1196 DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
1197 
1198 DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1199 DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1200 DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
1201 
1202 #undef DO_ZZZW_ACC
1203 
1204 #define DO_XTNB(NAME, TYPE, OP) \
1205 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
1206 {                                                            \
1207     intptr_t i, opr_sz = simd_oprsz(desc);                   \
1208     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
1209         TYPE nn = *(TYPE *)(vn + i);                         \
1210         nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
1211         *(TYPE *)(vd + i) = nn;                              \
1212     }                                                        \
1213 }
1214 
1215 #define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
1216 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
1217 {                                                                       \
1218     intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));      \
1219     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
1220         TYPE nn = *(TYPE *)(vn + i);                                    \
1221         *(TYPEN *)(vd + i + odd) = OP(nn);                              \
1222     }                                                                   \
1223 }
1224 
1225 #define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
1226 #define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
1227 #define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)
1228 
1229 DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1230 DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1231 DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1232 
1233 DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1234 DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1235 DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1236 
1237 #define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
1238 #define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
1239 #define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)
1240 
1241 DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1242 DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1243 DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1244 
1245 DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1246 DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1247 DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1248 
1249 DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1250 DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1251 DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1252 
1253 DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1254 DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1255 DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1256 
1257 #undef DO_XTNB
1258 #undef DO_XTNT
1259 
1260 void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1261 {
1262     intptr_t i, opr_sz = simd_oprsz(desc);
1263     int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1264     uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1265     uint32_t *a = va, *n = vn;
1266     uint64_t *d = vd, *m = vm;
1267 
1268     for (i = 0; i < opr_sz / 8; ++i) {
1269         uint32_t e1 = a[2 * i + H4(0)];
1270         uint32_t e2 = n[2 * i + sel] ^ inv;
1271         uint64_t c = extract64(m[i], 32, 1);
1272         /* Compute and store the entire 33-bit result at once. */
1273         d[i] = c + e1 + e2;
1274     }
1275 }
1276 
1277 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1278 {
1279     intptr_t i, opr_sz = simd_oprsz(desc);
1280     int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1281     uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1282     uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1283 
1284     for (i = 0; i < opr_sz / 8; i += 2) {
1285         Int128 e1 = int128_make64(a[i]);
1286         Int128 e2 = int128_make64(n[i + sel] ^ inv);
1287         Int128 c = int128_make64(m[i + 1] & 1);
1288         Int128 r = int128_add(int128_add(e1, e2), c);
1289         d[i + 0] = int128_getlo(r);
1290         d[i + 1] = int128_gethi(r);
1291     }
1292 }
1293 
1294 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1295 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1296 {                                                                       \
1297     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1298     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1299     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1300     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1301         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1302         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1303         TYPEW aa = *(TYPEW *)(va + HW(i));                              \
1304         *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
1305     }                                                                   \
1306 }
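/*
 * Saturating doubling multiply-add/subtract long: sel1 and sel2 pick the
 * even (bottom) or odd (top) narrow element of each wide slot for the two
 * multiplicands; the saturating doubled product is then added to or
 * subtracted from the accumulator, again with saturation.
 */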
1307 
1308 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1309            do_sqdmull_h, DO_SQADD_H)
1310 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1311            do_sqdmull_s, DO_SQADD_S)
1312 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1313            do_sqdmull_d, do_sqadd_d)
1314 
1315 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1316            do_sqdmull_h, DO_SQSUB_H)
1317 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1318            do_sqdmull_s, DO_SQSUB_S)
1319 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1320            do_sqdmull_d, do_sqsub_d)
1321 
1322 #undef DO_SQDMLAL
1323 
1324 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1325 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1326 {                                                               \
1327     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
1328     int rot = simd_data(desc);                                  \
1329     int sel_a = rot & 1, sel_b = sel_a ^ 1;                     \
1330     bool sub_r = rot == 1 || rot == 2;                          \
1331     bool sub_i = rot >= 2;                                      \
1332     TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
1333     for (i = 0; i < opr_sz; i += 2) {                           \
1334         TYPE elt1_a = n[H(i + sel_a)];                          \
1335         TYPE elt2_a = m[H(i + sel_a)];                          \
1336         TYPE elt2_b = m[H(i + sel_b)];                          \
1337         d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);           \
1338         d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);   \
1339     }                                                           \
1340 }
1341 
1342 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
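/*
 * In DO_CMLA_FUNC, rot effectively encodes the rotation in multiples of
 * 90 degrees: sel_a picks the real or imaginary part of n, and sub_r /
 * sub_i negate the product accumulated into the real / imaginary lane.
 * For example, rot == 0 yields d_r = a_r + n_r * m_r and
 * d_i = a_i + n_r * m_i.
 */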
1343 
1344 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1345 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1346 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1347 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1348 
1349 #define DO_SQRDMLAH_B(N, M, A, S) \
1350     do_sqrdmlah_b(N, M, A, S, true)
1351 #define DO_SQRDMLAH_H(N, M, A, S) \
1352     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1353 #define DO_SQRDMLAH_S(N, M, A, S) \
1354     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1355 #define DO_SQRDMLAH_D(N, M, A, S) \
1356     do_sqrdmlah_d(N, M, A, S, true)
1357 
1358 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1359 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1360 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1361 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1362 
1363 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1364 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
1365 {                                                                           \
1366     intptr_t i, j, oprsz = simd_oprsz(desc);                                \
1367     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
1368     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
1369     int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
1370     bool sub_r = rot == 1 || rot == 2;                                      \
1371     bool sub_i = rot >= 2;                                                  \
1372     TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
1373     for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
1374         TYPE elt2_a = m[H(i + idx + sel_a)];                                \
1375         TYPE elt2_b = m[H(i + idx + sel_b)];                                \
1376         for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
1377             TYPE elt1_a = n[H(i + j + sel_a)];                              \
1378             d[H(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);           \
1379             d[H(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);   \
1380         }                                                                   \
1381     }                                                                       \
1382 }
1383 
1384 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1385 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1386 
1387 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1388 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1389 
1390 #undef DO_CMLA
1391 #undef DO_CMLA_FUNC
1392 #undef DO_CMLA_IDX_FUNC
1393 #undef DO_SQRDMLAH_B
1394 #undef DO_SQRDMLAH_H
1395 #undef DO_SQRDMLAH_S
1396 #undef DO_SQRDMLAH_D
1397 
1398 /* Note N and M are 4 elements bundled into one unit. */
1399 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1400                          int sel_a, int sel_b, int sub_i)
1401 {
1402     for (int i = 0; i <= 1; i++) {
1403         int32_t elt1_r = (int8_t)(n >> (16 * i));
1404         int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1405         int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1406         int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1407 
1408         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1409     }
1410     return a;
1411 }
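/*
 * Each 32-bit unit of N packs two (real, imaginary) int8_t pairs.  The
 * sel_a/sel_b/sub_i arguments, derived from the rotation by the callers,
 * choose which byte of each M pair multiplies the real and imaginary
 * parts of N and whether the imaginary product is subtracted.
 * do_cdot_d below is the same with int16_t parts in 64-bit units.
 */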
1412 
1413 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1414                          int sel_a, int sel_b, int sub_i)
1415 {
1416     for (int i = 0; i <= 1; i++) {
1417         int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1418         int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1419         int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1420         int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1421 
1422         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1423     }
1424     return a;
1425 }
1426 
1427 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1428                               void *va, uint32_t desc)
1429 {
1430     int opr_sz = simd_oprsz(desc);
1431     int rot = simd_data(desc);
1432     int sel_a = rot & 1;
1433     int sel_b = sel_a ^ 1;
1434     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1435     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1436 
1437     for (int e = 0; e < opr_sz / 4; e++) {
1438         d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1439     }
1440 }
1441 
1442 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1443                               void *va, uint32_t desc)
1444 {
1445     int opr_sz = simd_oprsz(desc);
1446     int rot = simd_data(desc);
1447     int sel_a = rot & 1;
1448     int sel_b = sel_a ^ 1;
1449     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1450     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1451 
1452     for (int e = 0; e < opr_sz / 8; e++) {
1453         d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1454     }
1455 }
1456 
1457 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1458                              void *va, uint32_t desc)
1459 {
1460     int opr_sz = simd_oprsz(desc);
1461     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1462     int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1463     int sel_a = rot & 1;
1464     int sel_b = sel_a ^ 1;
1465     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1466     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1467 
1468     for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1469         uint32_t seg_m = m[seg + idx];
1470         for (int e = 0; e < 4; e++) {
1471             d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1472                                    sel_a, sel_b, sub_i);
1473         }
1474     }
1475 }
1476 
1477 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1478                              void *va, uint32_t desc)
1479 {
1480     int seg, opr_sz = simd_oprsz(desc);
1481     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1482     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1483     int sel_a = rot & 1;
1484     int sel_b = sel_a ^ 1;
1485     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1486     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1487 
1488     for (seg = 0; seg < opr_sz / 8; seg += 2) {
1489         uint64_t seg_m = m[seg + idx];
1490         for (int e = 0; e < 2; e++) {
1491             d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1492                                    sel_a, sel_b, sub_i);
1493         }
1494     }
1495 }
1496 
1497 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1498 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1499 {                                                                       \
1500     intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
1501     intptr_t i, j, idx = simd_data(desc);                               \
1502     TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
1503     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
1504         TYPE mm = m[i];                                                 \
1505         for (j = 0; j < segment; j++) {                                 \
1506             d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
1507         }                                                               \
1508     }                                                                   \
1509 }
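/*
 * Indexed multiply-accumulate: the indexed element of the vm operand is
 * read once per 128-bit segment (segment = 16 / sizeof(TYPE)) and reused
 * for every element within that segment.
 */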
1510 
1511 #define DO_SQRDMLAH_H(N, M, A) \
1512     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1513 #define DO_SQRDMLAH_S(N, M, A) \
1514     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1515 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1516 
1517 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1518 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1519 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1520 
1521 #define DO_SQRDMLSH_H(N, M, A) \
1522     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1523 #define DO_SQRDMLSH_S(N, M, A) \
1524     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1525 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1526 
1527 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1528 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1529 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1530 
1531 #undef DO_ZZXZ
1532 
1533 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1534 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
1535 {                                                                         \
1536     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1537     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1538     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1539     for (i = 0; i < oprsz; i += 16) {                                     \
1540         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1541         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1542             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1543             TYPEW aa = *(TYPEW *)(va + HW(i + j));                        \
1544             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                  \
1545         }                                                                 \
1546     }                                                                     \
1547 }
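/*
 * Indexed widening multiply-accumulate: idx selects the narrow m element
 * within each 128-bit segment, and sel picks the even (bottom) or odd
 * (top) narrow n elements that are widened before the operation.
 */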
1548 
1549 #define DO_MLA(N, M, A)  (A + N * M)
1550 
1551 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1552 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1553 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1554 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1555 
1556 #define DO_MLS(N, M, A)  (A - N * M)
1557 
1558 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1559 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1560 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1561 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1562 
1563 #define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
1564 #define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))
1565 
1566 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1567 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1568 
1569 #define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
1570 #define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))
1571 
1572 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1573 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1574 
1575 #undef DO_MLA
1576 #undef DO_MLS
1577 #undef DO_ZZXW
1578 
1579 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1580 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
1581 {                                                                         \
1582     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1583     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1584     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1585     for (i = 0; i < oprsz; i += 16) {                                     \
1586         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1587         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1588             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1589             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                      \
1590         }                                                                 \
1591     }                                                                     \
1592 }
1593 
1594 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1595 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1596 
1597 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1598 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1599 
1600 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1601 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1602 
1603 #undef DO_ZZX
1604 
1605 #define DO_BITPERM(NAME, TYPE, OP) \
1606 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1607 {                                                              \
1608     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1609     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {               \
1610         TYPE nn = *(TYPE *)(vn + i);                           \
1611         TYPE mm = *(TYPE *)(vm + i);                           \
1612         *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);      \
1613     }                                                          \
1614 }
1615 
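/*
 * BEXT: gather the bits of DATA selected by MASK into contiguous
 * low-order bits of the result (compare x86 PEXT).
 * Example with n = 8: data = 0b1011, mask = 0b0110 -> result = 0b01.
 */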
1616 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1617 {
1618     uint64_t res = 0;
1619     int db, rb = 0;
1620 
1621     for (db = 0; db < n; ++db) {
1622         if ((mask >> db) & 1) {
1623             res |= ((data >> db) & 1) << rb;
1624             ++rb;
1625         }
1626     }
1627     return res;
1628 }
1629 
1630 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1631 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1632 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1633 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1634 
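/*
 * BDEP: scatter the low-order bits of DATA to the bit positions selected
 * by MASK (compare x86 PDEP).
 * Example with n = 8: data = 0b0011, mask = 0b0110 -> result = 0b0110.
 */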
1635 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1636 {
1637     uint64_t res = 0;
1638     int rb, db = 0;
1639 
1640     for (rb = 0; rb < n; ++rb) {
1641         if ((mask >> rb) & 1) {
1642             res |= ((data >> db) & 1) << rb;
1643             ++db;
1644         }
1645     }
1646     return res;
1647 }
1648 
1649 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1650 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1651 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1652 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1653 
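/*
 * BGRP: pack the DATA bits selected by MASK into the low end of the
 * result and the remaining bits above them, preserving relative order
 * within each group.
 */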
1654 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1655 {
1656     uint64_t resm = 0, resu = 0;
1657     int db, rbm = 0, rbu = 0;
1658 
1659     for (db = 0; db < n; ++db) {
1660         uint64_t val = (data >> db) & 1;
1661         if ((mask >> db) & 1) {
1662             resm |= val << rbm++;
1663         } else {
1664             resu |= val << rbu++;
1665         }
1666     }
1667 
1668     return resm | (resu << rbm);
1669 }
1670 
1671 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1672 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1673 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1674 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1675 
1676 #undef DO_BITPERM
1677 
1678 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
1679 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
1680 {                                                               \
1681     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1682     int sub_r = simd_data(desc);                                \
1683     if (sub_r) {                                                \
1684         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1685             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1686             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1687             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1688             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1689             acc_r = ADD_OP(acc_r, el2_i);                       \
1690             acc_i = SUB_OP(acc_i, el2_r);                       \
1691             *(TYPE *)(vd + H(i)) = acc_r;                       \
1692             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1693         }                                                       \
1694     } else {                                                    \
1695         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1696             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1697             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1698             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1699             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1700             acc_r = SUB_OP(acc_r, el2_i);                       \
1701             acc_i = ADD_OP(acc_i, el2_r);                       \
1702             *(TYPE *)(vd + H(i)) = acc_r;                       \
1703             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1704         }                                                       \
1705     }                                                           \
1706 }
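/*
 * Complex integer add of the second operand rotated in the complex
 * plane: with sub_r set, m's imaginary part is added to the real
 * accumulator and its real part subtracted from the imaginary one,
 * i.e. m rotated by 270 degrees; otherwise the rotation is 90 degrees.
 */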
1707 
1708 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1709 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1710 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1711 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1712 
1713 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1714 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1715 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1716 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1717 
1718 #undef DO_CADD
1719 
1720 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1721 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1722 {                                                              \
1723     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1724     intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);      \
1725     int shift = simd_data(desc) >> 1;                          \
1726     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1727         TYPEW nn = *(TYPEN *)(vn + HN(i + sel));               \
1728         *(TYPEW *)(vd + HW(i)) = nn << shift;                  \
1729     }                                                          \
1730 }
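/*
 * Widening shift left by immediate: sel selects the even-numbered
 * (bottom) or odd-numbered (top) narrow source elements, which are
 * sign- or zero-extended per TYPEN and then shifted.
 */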
1731 
1732 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1733 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1734 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1735 
1736 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1737 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1738 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1739 
1740 #undef DO_ZZI_SHLL
1741 
1742 /* Two-operand reduction expander, controlled by a predicate.
1743  * The difference between TYPERED and TYPERET has to do with
1744  * sign-extension.  E.g. for SMAX, TYPERED must be signed,
1745  * but TYPERET must be unsigned so that e.g. a 32-bit value
1746  * is not sign-extended to the ABI uint64_t return type.
1747  */
1748 /* ??? If we were to vectorize this by hand the reduction ordering
1749  * would change.  For integer operands, this is perfectly fine.
1750  */
1751 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1752 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1753 {                                                          \
1754     intptr_t i, opr_sz = simd_oprsz(desc);                 \
1755     TYPERED ret = INIT;                                    \
1756     for (i = 0; i < opr_sz; ) {                            \
1757         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
1758         do {                                               \
1759             if (pg & 1) {                                  \
1760                 TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
1761                 ret = OP(ret, nn);                         \
1762             }                                              \
1763             i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
1764         } while (i & 15);                                  \
1765     }                                                      \
1766     return (TYPERET)ret;                                   \
1767 }
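/*
 * The 16-bit predicate load above covers one 16-byte segment of the
 * vector; since predicates allocate one bit per vector byte,
 * "pg >>= sizeof(TYPEELT)" steps to the bit governing the next element.
 * As a concrete case of the TYPERED/TYPERET distinction: sve_smaxv_s
 * reduces with int32_t but returns uint32_t, so the result is
 * zero-extended rather than sign-extended into the uint64_t ABI return.
 */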
1768 
1769 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
1770 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1771 {                                                          \
1772     intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
1773     TYPEE *n = vn;                                         \
1774     uint8_t *pg = vg;                                      \
1775     TYPER ret = INIT;                                      \
1776     for (i = 0; i < opr_sz; i += 1) {                      \
1777         if (pg[H1(i)] & 1) {                               \
1778             TYPEE nn = n[i];                               \
1779             ret = OP(ret, nn);                             \
1780         }                                                  \
1781     }                                                      \
1782     return ret;                                            \
1783 }
1784 
1785 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1786 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1787 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1788 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1789 
1790 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1791 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1792 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1793 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1794 
1795 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1796 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1797 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1798 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1799 
1800 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1801 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1802 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1803 
1804 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1805 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1806 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1807 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1808 
1809 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1810 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1811 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1812 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1813 
1814 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1815 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1816 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1817 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1818 
1819 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1820 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1821 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1822 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1823 
1824 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1825 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1826 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1827 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1828 
1829 #undef DO_VPZ
1830 #undef DO_VPZ_D
1831 
1832 /* Two vector operand, one scalar operand, unpredicated.  */
1833 #define DO_ZZI(NAME, TYPE, OP)                                       \
1834 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
1835 {                                                                    \
1836     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
1837     TYPE s = s64, *d = vd, *n = vn;                                  \
1838     for (i = 0; i < opr_sz; ++i) {                                   \
1839         d[i] = OP(n[i], s);                                          \
1840     }                                                                \
1841 }
1842 
1843 #define DO_SUBR(X, Y)   (Y - X)
1844 
1845 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1846 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1847 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1848 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1849 
1850 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1851 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1852 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1853 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1854 
1855 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1856 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1857 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1858 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1859 
1860 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1861 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1862 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1863 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1864 
1865 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1866 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1867 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1868 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1869 
1870 #undef DO_ZZI
1871 
1872 #undef DO_AND
1873 #undef DO_ORR
1874 #undef DO_EOR
1875 #undef DO_BIC
1876 #undef DO_ADD
1877 #undef DO_SUB
1878 #undef DO_MAX
1879 #undef DO_MIN
1880 #undef DO_ABD
1881 #undef DO_MUL
1882 #undef DO_DIV
1883 #undef DO_ASR
1884 #undef DO_LSR
1885 #undef DO_LSL
1886 #undef DO_SUBR
1887 
1888 /* Similar to the ARM LastActiveElement pseudocode function, except the
1889    result is multiplied by the element size.  This includes the not found
1890    indication; e.g. not found for esz=3 is -8.  */
1891 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1892 {
1893     uint64_t mask = pred_esz_masks[esz];
1894     intptr_t i = words;
1895 
1896     do {
1897         uint64_t this_g = g[--i] & mask;
1898         if (this_g) {
1899             return i * 64 + (63 - clz64(this_g));
1900         }
1901     } while (i > 0);
1902     return (intptr_t)-1 << esz;
1903 }
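/*
 * Example: for esz == 2 (4-byte elements) with only predicate bit 8 set,
 * the result is 8, i.e. element index 2 scaled by the element size;
 * with no active bits the result is -4.
 */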
1904 
1905 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1906 {
1907     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1908     uint32_t flags = PREDTEST_INIT;
1909     uint64_t *d = vd, *g = vg;
1910     intptr_t i = 0;
1911 
1912     do {
1913         uint64_t this_d = d[i];
1914         uint64_t this_g = g[i];
1915 
1916         if (this_g) {
1917             if (!(flags & 4)) {
1918                 /* Set in D the first (lowest) set bit of G.  */
1919                 this_d |= this_g & -this_g;
1920                 d[i] = this_d;
1921             }
1922             flags = iter_predtest_fwd(this_d, this_g, flags);
1923         }
1924     } while (++i < words);
1925 
1926     return flags;
1927 }
1928 
1929 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1930 {
1931     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1932     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1933     uint32_t flags = PREDTEST_INIT;
1934     uint64_t *d = vd, *g = vg, esz_mask;
1935     intptr_t i, next;
1936 
1937     next = last_active_element(vd, words, esz) + (1 << esz);
1938     esz_mask = pred_esz_masks[esz];
1939 
1940     /* Similar to the pseudocode for pnext, but scaled by ESZ
1941        so that we find the correct bit.  */
1942     if (next < words * 64) {
1943         uint64_t mask = -1;
1944 
1945         if (next & 63) {
1946             mask = ~((1ull << (next & 63)) - 1);
1947             next &= -64;
1948         }
1949         do {
1950             uint64_t this_g = g[next / 64] & esz_mask & mask;
1951             if (this_g != 0) {
1952                 next = (next & -64) + ctz64(this_g);
1953                 break;
1954             }
1955             next += 64;
1956             mask = -1;
1957         } while (next < words * 64);
1958     }
1959 
1960     i = 0;
1961     do {
1962         uint64_t this_d = 0;
1963         if (i == next / 64) {
1964             this_d = 1ull << (next & 63);
1965         }
1966         d[i] = this_d;
1967         flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1968     } while (++i < words);
1969 
1970     return flags;
1971 }
1972 
1973 /*
1974  * Copy Zn into Zd, and store zero into inactive elements.
1975  * If inv, store zeros into the active elements.
1976  */
1977 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1978 {
1979     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1980     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1981     uint64_t *d = vd, *n = vn;
1982     uint8_t *pg = vg;
1983 
1984     for (i = 0; i < opr_sz; i += 1) {
1985         d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1986     }
1987 }
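/*
 * expand_pred_b() turns the 8 predicate bits covering these 8 bytes into
 * a 64-bit byte mask; XOR-ing with inv (0 or all-ones) flips which set
 * of elements is zeroed.  The _h and _s variants below differ only in
 * the expansion, and the _d variant tests one predicate bit per element.
 */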
1988 
1989 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1990 {
1991     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1992     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1993     uint64_t *d = vd, *n = vn;
1994     uint8_t *pg = vg;
1995 
1996     for (i = 0; i < opr_sz; i += 1) {
1997         d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1998     }
1999 }
2000 
2001 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
2002 {
2003     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2004     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
2005     uint64_t *d = vd, *n = vn;
2006     uint8_t *pg = vg;
2007 
2008     for (i = 0; i < opr_sz; i += 1) {
2009         d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
2010     }
2011 }
2012 
2013 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
2014 {
2015     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2016     uint64_t *d = vd, *n = vn;
2017     uint8_t *pg = vg;
2018     uint8_t inv = simd_data(desc);
2019 
2020     for (i = 0; i < opr_sz; i += 1) {
2021         d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2022     }
2023 }
2024 
2025 /* Three-operand expander, immediate operand, controlled by a predicate.
2026  */
2027 #define DO_ZPZI(NAME, TYPE, H, OP)                              \
2028 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2029 {                                                               \
2030     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2031     TYPE imm = simd_data(desc);                                 \
2032     for (i = 0; i < opr_sz; ) {                                 \
2033         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
2034         do {                                                    \
2035             if (pg & 1) {                                       \
2036                 TYPE nn = *(TYPE *)(vn + H(i));                 \
2037                 *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
2038             }                                                   \
2039             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
2040         } while (i & 15);                                       \
2041     }                                                           \
2042 }
2043 
2044 /* Similarly, specialized for 64-bit operands.  */
2045 #define DO_ZPZI_D(NAME, TYPE, OP)                               \
2046 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2047 {                                                               \
2048     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
2049     TYPE *d = vd, *n = vn;                                      \
2050     TYPE imm = simd_data(desc);                                 \
2051     uint8_t *pg = vg;                                           \
2052     for (i = 0; i < opr_sz; i += 1) {                           \
2053         if (pg[H1(i)] & 1) {                                    \
2054             TYPE nn = n[i];                                     \
2055             d[i] = OP(nn, imm);                                 \
2056         }                                                       \
2057     }                                                           \
2058 }
2059 
2060 #define DO_SHR(N, M)  (N >> M)
2061 #define DO_SHL(N, M)  (N << M)
2062 
2063 /* Arithmetic shift right for division.  This rounds negative numbers
2064    toward zero as per signed division.  Therefore before shifting,
2065    when N is negative, add 2**M-1.  */
2066 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
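/*
 * Example with M = 2 (divide by 4): N = -7 becomes -7 + 3 = -4, and
 * -4 >> 2 = -1 == trunc(-7 / 4), whereas a plain arithmetic shift
 * would give -7 >> 2 = -2.
 */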
2067 
2068 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2069 {
2070     if (likely(sh < 64)) {
2071         return (x >> sh) + ((x >> (sh - 1)) & 1);
2072     } else if (sh == 64) {
2073         return x >> 63;
2074     } else {
2075         return 0;
2076     }
2077 }
2078 
2079 static inline int64_t do_srshr(int64_t x, unsigned sh)
2080 {
2081     if (likely(sh < 64)) {
2082         return (x >> sh) + ((x >> (sh - 1)) & 1);
2083     } else {
2084         /* Rounding the sign bit always produces 0. */
2085         return 0;
2086     }
2087 }
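/*
 * These rounding right shifts add back the last bit shifted out, e.g.
 * do_urshr(7, 1) = (7 >> 1) + 1 = 4, rounding 3.5 upward.  For sh == 64
 * only the rounding contribution of bit 63 remains.
 */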
2088 
2089 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2090 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2091 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2092 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2093 
2094 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2095 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2096 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2097 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2098 
2099 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2100 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2101 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2102 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2103 
2104 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2105 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2106 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2107 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2108 
2109 /* SVE2 bitwise shift by immediate */
2110 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2111 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2112 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2113 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2114 
2115 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2116 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2117 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2118 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2119 
2120 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2121 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2122 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2123 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2124 
2125 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2126 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2127 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2128 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2129 
2130 #define do_suqrshl_b(n, m) \
2131    ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2132 #define do_suqrshl_h(n, m) \
2133    ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2134 #define do_suqrshl_s(n, m) \
2135    ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2136 #define do_suqrshl_d(n, m) \
2137    ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2138 
2139 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2140 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2141 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2142 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2143 
2144 #undef DO_ASRD
2145 #undef DO_ZPZI
2146 #undef DO_ZPZI_D
2147 
2148 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2149 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
2150 {                                                            \
2151     intptr_t i, opr_sz = simd_oprsz(desc);                   \
2152     int shift = simd_data(desc);                             \
2153     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
2154         TYPEW nn = *(TYPEW *)(vn + i);                       \
2155         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
2156     }                                                        \
2157 }
2158 
2159 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
2160 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
2161 {                                                                 \
2162     intptr_t i, opr_sz = simd_oprsz(desc);                        \
2163     int shift = simd_data(desc);                                  \
2164     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
2165         TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
2166         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
2167     }                                                             \
2168 }
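/*
 * Narrowing shifts: the B forms write the narrowed value to the low half
 * of each wide destination element and clear the high half (the TYPEN
 * cast zero-extends back to TYPEW); the T forms store only the narrow
 * value into the high half, leaving the low half of the destination
 * untouched.
 */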
2169 
2170 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2171 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2172 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2173 
2174 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2175 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2176 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2177 
2178 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2179 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2180 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2181 
2182 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2183 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2184 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2185 
2186 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2187 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2188 #define DO_SQSHRUN_D(x, sh) \
2189     do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2190 
2191 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2192 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2193 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2194 
2195 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2196 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2197 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2198 
2199 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2200 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2201 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2202 
2203 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2204 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2205 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2206 
2207 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2208 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2209 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2210 
2211 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2212 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2213 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2214 
2215 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2216 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2217 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2218 
2219 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2220 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2221 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2222 
2223 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2224 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2225 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2226 
2227 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2228 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2229 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2230 
2231 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2232 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2233 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2234 
2235 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2236 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2237 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2238 
2239 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2240 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2241 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2242 
2243 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2244 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2245 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2246 
2247 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2248 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2249 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2250 
2251 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2252 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2253 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2254 
2255 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2256 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2257 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2258 
2259 #undef DO_SHRNB
2260 #undef DO_SHRNT
2261 
2262 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                           \
2263 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2264 {                                                                           \
2265     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2266     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2267         TYPEW nn = *(TYPEW *)(vn + i);                                      \
2268         TYPEW mm = *(TYPEW *)(vm + i);                                      \
2269         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);                      \
2270     }                                                                       \
2271 }
2272 
2273 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)                   \
2274 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2275 {                                                                           \
2276     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2277     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2278         TYPEW nn = *(TYPEW *)(vn + HW(i));                                  \
2279         TYPEW mm = *(TYPEW *)(vm + HW(i));                                  \
2280         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);         \
2281     }                                                                       \
2282 }
2283 
2284 #define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
2285 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2286 #define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
2287 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
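/*
 * Add/subtract and keep the high half of the wide result; the rounding
 * (R) forms first add the rounding constant 1 << (SH - 1), i.e. half of
 * the discarded low part.
 */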
2288 
2289 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2290 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2291 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2292 
2293 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2294 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2295 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2296 
2297 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2298 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2299 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2300 
2301 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2302 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2303 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2304 
2305 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2306 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2307 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2308 
2309 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2310 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2311 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2312 
2313 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2314 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2315 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2316 
2317 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2318 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2319 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2320 
2321 #undef DO_RSUBHN
2322 #undef DO_SUBHN
2323 #undef DO_RADDHN
2324 #undef DO_ADDHN
2325 
2326 #undef DO_BINOPNB
2327 
2328 /* Fully general four-operand expander, controlled by a predicate.
2329  */
2330 #define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
2331 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2332                   void *vg, uint32_t desc)                    \
2333 {                                                             \
2334     intptr_t i, opr_sz = simd_oprsz(desc);                    \
2335     for (i = 0; i < opr_sz; ) {                               \
2336         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
2337         do {                                                  \
2338             if (pg & 1) {                                     \
2339                 TYPE nn = *(TYPE *)(vn + H(i));               \
2340                 TYPE mm = *(TYPE *)(vm + H(i));               \
2341                 TYPE aa = *(TYPE *)(va + H(i));               \
2342                 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
2343             }                                                 \
2344             i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
2345         } while (i & 15);                                     \
2346     }                                                         \
2347 }
2348 
2349 /* Similarly, specialized for 64-bit operands.  */
2350 #define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
2351 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2352                   void *vg, uint32_t desc)                    \
2353 {                                                             \
2354     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
2355     TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
2356     uint8_t *pg = vg;                                         \
2357     for (i = 0; i < opr_sz; i += 1) {                         \
2358         if (pg[H1(i)] & 1) {                                  \
2359             TYPE aa = a[i], nn = n[i], mm = m[i];             \
2360             d[i] = OP(aa, nn, mm);                            \
2361         }                                                     \
2362     }                                                         \
2363 }
2364 
2365 #define DO_MLA(A, N, M)  (A + N * M)
2366 #define DO_MLS(A, N, M)  (A - N * M)
2367 
2368 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2369 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2370 
2371 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2372 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2373 
2374 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2375 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2376 
2377 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2378 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2379 
2380 #undef DO_MLA
2381 #undef DO_MLS
2382 #undef DO_ZPZZZ
2383 #undef DO_ZPZZZ_D
2384 
2385 void HELPER(sve_index_b)(void *vd, uint32_t start,
2386                          uint32_t incr, uint32_t desc)
2387 {
2388     intptr_t i, opr_sz = simd_oprsz(desc);
2389     uint8_t *d = vd;
2390     for (i = 0; i < opr_sz; i += 1) {
2391         d[H1(i)] = start + i * incr;
2392     }
2393 }
2394 
2395 void HELPER(sve_index_h)(void *vd, uint32_t start,
2396                          uint32_t incr, uint32_t desc)
2397 {
2398     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2399     uint16_t *d = vd;
2400     for (i = 0; i < opr_sz; i += 1) {
2401         d[H2(i)] = start + i * incr;
2402     }
2403 }
2404 
2405 void HELPER(sve_index_s)(void *vd, uint32_t start,
2406                          uint32_t incr, uint32_t desc)
2407 {
2408     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2409     uint32_t *d = vd;
2410     for (i = 0; i < opr_sz; i += 1) {
2411         d[H4(i)] = start + i * incr;
2412     }
2413 }
2414 
2415 void HELPER(sve_index_d)(void *vd, uint64_t start,
2416                          uint64_t incr, uint32_t desc)
2417 {
2418     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2419     uint64_t *d = vd;
2420     for (i = 0; i < opr_sz; i += 1) {
2421         d[i] = start + i * incr;
2422     }
2423 }
2424 
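/*
 * ADR address generation: d = n + (m << sh).  The p32/p64 forms use the
 * packed 32-/64-bit lanes directly, while the s32/u32 forms treat each
 * 64-bit m lane as a sign- or zero-extended 32-bit offset.
 */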
2425 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2426 {
2427     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2428     uint32_t sh = simd_data(desc);
2429     uint32_t *d = vd, *n = vn, *m = vm;
2430     for (i = 0; i < opr_sz; i += 1) {
2431         d[i] = n[i] + (m[i] << sh);
2432     }
2433 }
2434 
2435 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2436 {
2437     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2438     uint64_t sh = simd_data(desc);
2439     uint64_t *d = vd, *n = vn, *m = vm;
2440     for (i = 0; i < opr_sz; i += 1) {
2441         d[i] = n[i] + (m[i] << sh);
2442     }
2443 }
2444 
2445 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2446 {
2447     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2448     uint64_t sh = simd_data(desc);
2449     uint64_t *d = vd, *n = vn, *m = vm;
2450     for (i = 0; i < opr_sz; i += 1) {
2451         d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2452     }
2453 }
2454 
2455 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2456 {
2457     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2458     uint64_t sh = simd_data(desc);
2459     uint64_t *d = vd, *n = vn, *m = vm;
2460     for (i = 0; i < opr_sz; i += 1) {
2461         d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2462     }
2463 }
2464 
2465 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2466 {
2467     /* These constants are cut and pasted directly from the ARM pseudocode.  */
2468     static const uint16_t coeff[] = {
2469         0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2470         0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2471         0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2472         0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2473     };
2474     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2475     uint16_t *d = vd, *n = vn;
2476 
2477     for (i = 0; i < opr_sz; i++) {
2478         uint16_t nn = n[i];
2479         intptr_t idx = extract32(nn, 0, 5);
2480         uint16_t exp = extract32(nn, 5, 5);
2481         d[i] = coeff[idx] | (exp << 10);
2482     }
2483 }
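/*
 * FEXPA assembles a floating-point value directly from bit fields of the
 * input: the low bits index a table holding the fraction bits of
 * 2^(i/32) (2^(i/64) for the 32- and 64-bit forms), and the following
 * bits are placed straight into the exponent field.
 */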
2484 
2485 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2486 {
2487     /* These constants are cut and pasted directly from the ARM pseudocode.  */
2488     static const uint32_t coeff[] = {
2489         0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2490         0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2491         0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2492         0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2493         0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2494         0x1ef532, 0x20b051, 0x227043, 0x243516,
2495         0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2496         0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2497         0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2498         0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2499         0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2500         0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2501         0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2502         0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2503         0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2504         0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2505     };
2506     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2507     uint32_t *d = vd, *n = vn;
2508 
2509     for (i = 0; i < opr_sz; i++) {
2510         uint32_t nn = n[i];
2511         intptr_t idx = extract32(nn, 0, 6);
2512         uint32_t exp = extract32(nn, 6, 8);
2513         d[i] = coeff[idx] | (exp << 23);
2514     }
2515 }
2516 
2517 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2518 {
2519     /* These constants are cut and pasted directly from the ARM pseudocode.  */
2520     static const uint64_t coeff[] = {
2521         0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2522         0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2523         0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2524         0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2525         0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2526         0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2527         0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2528         0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2529         0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2530         0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2531         0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2532         0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2533         0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2534         0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2535         0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2536         0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2537         0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2538         0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2539         0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2540         0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2541         0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2542         0xFA7C1819E90D8ull,
2543     };
2544     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2545     uint64_t *d = vd, *n = vn;
2546 
2547     for (i = 0; i < opr_sz; i++) {
2548         uint64_t nn = n[i];
2549         intptr_t idx = extract32(nn, 0, 6);
2550         uint64_t exp = extract32(nn, 6, 11);
2551         d[i] = coeff[idx] | (exp << 52);
2552     }
2553 }
2554 
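/*
 * FTSSEL: bit 0 of each control element substitutes 1.0 for the input
 * value; bit 1 then negates the result, with floatN_maybe_ah_chs
 * applying the FPCR.AH rules for the sign flip.
 */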
2555 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2556 {
2557     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2558     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2559     uint16_t *d = vd, *n = vn, *m = vm;
2560     for (i = 0; i < opr_sz; i += 1) {
2561         uint16_t nn = n[i];
2562         uint16_t mm = m[i];
2563         if (mm & 1) {
2564             nn = float16_one;
2565         }
2566         if (mm & 2) {
2567             nn = float16_maybe_ah_chs(nn, fpcr_ah);
2568         }
2569         d[i] = nn;
2570     }
2571 }
2572 
2573 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2574 {
2575     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2576     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2577     uint32_t *d = vd, *n = vn, *m = vm;
2578     for (i = 0; i < opr_sz; i += 1) {
2579         uint32_t nn = n[i];
2580         uint32_t mm = m[i];
2581         if (mm & 1) {
2582             nn = float32_one;
2583         }
2584         if (mm & 2) {
2585             nn = float32_maybe_ah_chs(nn, fpcr_ah);
2586         }
2587         d[i] = nn;
2588     }
2589 }
2590 
2591 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2592 {
2593     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2594     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
2595     uint64_t *d = vd, *n = vn, *m = vm;
2596     for (i = 0; i < opr_sz; i += 1) {
2597         uint64_t nn = n[i];
2598         uint64_t mm = m[i];
2599         if (mm & 1) {
2600             nn = float64_one;
2601         }
2602         if (mm & 2) {
2603             nn = float64_maybe_ah_chs(nn, fpcr_ah);
2604         }
2605         d[i] = nn;
2606     }
2607 }
2608 
2609 /*
2610  * Signed saturating addition with scalar operand.
2611  */
2612 
2613 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2614 {
2615     intptr_t i, oprsz = simd_oprsz(desc);
2616 
2617     for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2618         *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2619     }
2620 }
2621 
2622 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2623 {
2624     intptr_t i, oprsz = simd_oprsz(desc);
2625 
2626     for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2627         *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2628     }
2629 }
2630 
2631 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2632 {
2633     intptr_t i, oprsz = simd_oprsz(desc);
2634 
2635     for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2636         *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2637     }
2638 }
2639 
2640 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2641 {
2642     intptr_t i, oprsz = simd_oprsz(desc);
2643 
2644     for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2645         *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2646     }
2647 }
2648 
2649 /*
2650  * Unsigned saturating addition with scalar operand.
2651  */
2652 
2653 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2654 {
2655     intptr_t i, oprsz = simd_oprsz(desc);
2656 
2657     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2658         *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2659     }
2660 }
2661 
2662 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2663 {
2664     intptr_t i, oprsz = simd_oprsz(desc);
2665 
2666     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2667         *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2668     }
2669 }
2670 
2671 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2672 {
2673     intptr_t i, oprsz = simd_oprsz(desc);
2674 
2675     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2676         *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2677     }
2678 }
2679 
2680 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2681 {
2682     intptr_t i, oprsz = simd_oprsz(desc);
2683 
2684     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2685         *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2686     }
2687 }
2688 
2689 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2690 {
2691     intptr_t i, oprsz = simd_oprsz(desc);
2692 
2693     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2694         *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2695     }
2696 }
2697 
2698 /* Two operand predicated copy immediate with merge.  All valid immediates
2699  * can fit within 17 signed bits in the simd_data field.
2700  */
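/*
 * Each predicate byte governs 8 bytes of data: expand_pred_b() widens
 * it into a 64-bit byte mask (e.g. expand_pred_b(0x05) ==
 * 0x0000000000ff00ff), so a full uint64_t of the destination can be
 * merged at once; expand_pred_h/s do the same at halfword and word
 * granularity.
 */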
2701 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2702                          uint64_t mm, uint32_t desc)
2703 {
2704     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2705     uint64_t *d = vd, *n = vn;
2706     uint8_t *pg = vg;
2707 
2708     mm = dup_const(MO_8, mm);
2709     for (i = 0; i < opr_sz; i += 1) {
2710         uint64_t nn = n[i];
2711         uint64_t pp = expand_pred_b(pg[H1(i)]);
2712         d[i] = (mm & pp) | (nn & ~pp);
2713     }
2714 }
2715 
2716 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2717                          uint64_t mm, uint32_t desc)
2718 {
2719     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2720     uint64_t *d = vd, *n = vn;
2721     uint8_t *pg = vg;
2722 
2723     mm = dup_const(MO_16, mm);
2724     for (i = 0; i < opr_sz; i += 1) {
2725         uint64_t nn = n[i];
2726         uint64_t pp = expand_pred_h(pg[H1(i)]);
2727         d[i] = (mm & pp) | (nn & ~pp);
2728     }
2729 }
2730 
2731 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2732                          uint64_t mm, uint32_t desc)
2733 {
2734     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2735     uint64_t *d = vd, *n = vn;
2736     uint8_t *pg = vg;
2737 
2738     mm = dup_const(MO_32, mm);
2739     for (i = 0; i < opr_sz; i += 1) {
2740         uint64_t nn = n[i];
2741         uint64_t pp = expand_pred_s(pg[H1(i)]);
2742         d[i] = (mm & pp) | (nn & ~pp);
2743     }
2744 }
2745 
2746 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2747                          uint64_t mm, uint32_t desc)
2748 {
2749     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2750     uint64_t *d = vd, *n = vn;
2751     uint8_t *pg = vg;
2752 
2753     for (i = 0; i < opr_sz; i += 1) {
2754         uint64_t nn = n[i];
2755         d[i] = (pg[H1(i)] & 1 ? mm : nn);
2756     }
2757 }
2758 
2759 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2760 {
2761     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2762     uint64_t *d = vd;
2763     uint8_t *pg = vg;
2764 
2765     val = dup_const(MO_8, val);
2766     for (i = 0; i < opr_sz; i += 1) {
2767         d[i] = val & expand_pred_b(pg[H1(i)]);
2768     }
2769 }
2770 
2771 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2772 {
2773     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2774     uint64_t *d = vd;
2775     uint8_t *pg = vg;
2776 
2777     val = dup_const(MO_16, val);
2778     for (i = 0; i < opr_sz; i += 1) {
2779         d[i] = val & expand_pred_h(pg[H1(i)]);
2780     }
2781 }
2782 
2783 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2784 {
2785     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2786     uint64_t *d = vd;
2787     uint8_t *pg = vg;
2788 
2789     val = dup_const(MO_32, val);
2790     for (i = 0; i < opr_sz; i += 1) {
2791         d[i] = val & expand_pred_s(pg[H1(i)]);
2792     }
2793 }
2794 
2795 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2796 {
2797     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2798     uint64_t *d = vd;
2799     uint8_t *pg = vg;
2800 
2801     for (i = 0; i < opr_sz; i += 1) {
2802         d[i] = (pg[H1(i)] & 1 ? val : 0);
2803     }
2804 }
2805 
2806 /* Big-endian hosts need to frob the byte indices.  If the copy
2807  * happens to be 8-byte aligned, then no frobbing is necessary.
2808  */
2809 static void swap_memmove(void *vd, void *vs, size_t n)
2810 {
2811     uintptr_t d = (uintptr_t)vd;
2812     uintptr_t s = (uintptr_t)vs;
2813     uintptr_t o = (d | s | n) & 7;
2814     size_t i;
2815 
2816 #if !HOST_BIG_ENDIAN
2817     o = 0;
2818 #endif
2819     switch (o) {
2820     case 0:
2821         memmove(vd, vs, n);
2822         break;
2823 
2824     case 4:
2825         if (d < s || d >= s + n) {
2826             for (i = 0; i < n; i += 4) {
2827                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2828             }
2829         } else {
2830             for (i = n; i > 0; ) {
2831                 i -= 4;
2832                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2833             }
2834         }
2835         break;
2836 
2837     case 2:
2838     case 6:
2839         if (d < s || d >= s + n) {
2840             for (i = 0; i < n; i += 2) {
2841                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2842             }
2843         } else {
2844             for (i = n; i > 0; ) {
2845                 i -= 2;
2846                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2847             }
2848         }
2849         break;
2850 
2851     default:
2852         if (d < s || d >= s + n) {
2853             for (i = 0; i < n; i++) {
2854                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2855             }
2856         } else {
2857             for (i = n; i > 0; ) {
2858                 i -= 1;
2859                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2860             }
2861         }
2862         break;
2863     }
2864 }
2865 
2866 /* Similarly for memset of 0.  */
2867 static void swap_memzero(void *vd, size_t n)
2868 {
2869     uintptr_t d = (uintptr_t)vd;
2870     uintptr_t o = (d | n) & 7;
2871     size_t i;
2872 
2873     /* Usually, the first bit of a predicate is set, so N is 0.  */
2874     if (likely(n == 0)) {
2875         return;
2876     }
2877 
2878 #if !HOST_BIG_ENDIAN
2879     o = 0;
2880 #endif
2881     switch (o) {
2882     case 0:
2883         memset(vd, 0, n);
2884         break;
2885 
2886     case 4:
2887         for (i = 0; i < n; i += 4) {
2888             *(uint32_t *)H1_4(d + i) = 0;
2889         }
2890         break;
2891 
2892     case 2:
2893     case 6:
2894         for (i = 0; i < n; i += 2) {
2895             *(uint16_t *)H1_2(d + i) = 0;
2896         }
2897         break;
2898 
2899     default:
2900         for (i = 0; i < n; i++) {
2901             *(uint8_t *)H1(d + i) = 0;
2902         }
2903         break;
2904     }
2905 }
2906 
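/*
 * EXT concatenates Zn:Zm and extracts a full vector starting at byte
 * offset n_ofs: the last (oprsz - n_ofs) bytes of Zn followed by the
 * first n_ofs bytes of Zm.
 */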
2907 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2908 {
2909     intptr_t opr_sz = simd_oprsz(desc);
2910     size_t n_ofs = simd_data(desc);
2911     size_t n_siz = opr_sz - n_ofs;
2912 
2913     if (vd != vm) {
2914         swap_memmove(vd, vn + n_ofs, n_siz);
2915         swap_memmove(vd + n_siz, vm, n_ofs);
2916     } else if (vd != vn) {
2917         swap_memmove(vd + n_siz, vd, n_ofs);
2918         swap_memmove(vd, vn + n_ofs, n_siz);
2919     } else {
2920         /* vd == vn == vm.  Need temp space.  */
2921         ARMVectorReg tmp;
2922         swap_memmove(&tmp, vm, n_ofs);
2923         swap_memmove(vd, vd + n_ofs, n_siz);
2924         memcpy(vd + n_siz, &tmp, n_ofs);
2925     }
2926 }
2927 
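/*
 * INSR shifts the entire vector up by one element and writes the
 * scalar into element 0; swap_memmove handles any host byte swizzle.
 */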
2928 #define DO_INSR(NAME, TYPE, H) \
2929 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2930 {                                                                  \
2931     intptr_t opr_sz = simd_oprsz(desc);                            \
2932     swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
2933     *(TYPE *)(vd + H(0)) = val;                                    \
2934 }
2935 
2936 DO_INSR(sve_insr_b, uint8_t, H1)
2937 DO_INSR(sve_insr_h, uint16_t, H1_2)
2938 DO_INSR(sve_insr_s, uint32_t, H1_4)
2939 DO_INSR(sve_insr_d, uint64_t, H1_8)
2940 
2941 #undef DO_INSR
2942 
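/*
 * REV reverses the element order of the whole vector, 64 bits at a
 * time: the chunk at offset i swaps places with the mirrored chunk,
 * and the elements within each chunk are reversed (bswap64 for bytes,
 * hswap64 for halfwords, rol64(x, 32) for words, nothing for
 * doublewords).
 */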
2943 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2944 {
2945     intptr_t i, j, opr_sz = simd_oprsz(desc);
2946     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2947         uint64_t f = *(uint64_t *)(vn + i);
2948         uint64_t b = *(uint64_t *)(vn + j);
2949         *(uint64_t *)(vd + i) = bswap64(b);
2950         *(uint64_t *)(vd + j) = bswap64(f);
2951     }
2952 }
2953 
2954 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2955 {
2956     intptr_t i, j, opr_sz = simd_oprsz(desc);
2957     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2958         uint64_t f = *(uint64_t *)(vn + i);
2959         uint64_t b = *(uint64_t *)(vn + j);
2960         *(uint64_t *)(vd + i) = hswap64(b);
2961         *(uint64_t *)(vd + j) = hswap64(f);
2962     }
2963 }
2964 
2965 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2966 {
2967     intptr_t i, j, opr_sz = simd_oprsz(desc);
2968     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2969         uint64_t f = *(uint64_t *)(vn + i);
2970         uint64_t b = *(uint64_t *)(vn + j);
2971         *(uint64_t *)(vd + i) = rol64(b, 32);
2972         *(uint64_t *)(vd + j) = rol64(f, 32);
2973     }
2974 }
2975 
2976 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2977 {
2978     intptr_t i, j, opr_sz = simd_oprsz(desc);
2979     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2980         uint64_t f = *(uint64_t *)(vn + i);
2981         uint64_t b = *(uint64_t *)(vn + j);
2982         *(uint64_t *)(vd + i) = b;
2983         *(uint64_t *)(vd + j) = f;
2984     }
2985 }
2986 
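/*
 * TBL/TBX: each index element selects an element from the table
 * vector(s).  An index beyond the available table elements yields 0
 * for TBL and leaves the destination element unchanged for TBX.
 */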
2987 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2988 
2989 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2990                            bool is_tbx, tb_impl_fn *fn)
2991 {
2992     ARMVectorReg scratch;
2993     uintptr_t oprsz = simd_oprsz(desc);
2994 
2995     if (unlikely(vd == vn)) {
2996         vn = memcpy(&scratch, vn, oprsz);
2997     }
2998 
2999     fn(vd, vn, NULL, vm, oprsz, is_tbx);
3000 }
3001 
3002 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
3003                            uint32_t desc, bool is_tbx, tb_impl_fn *fn)
3004 {
3005     ARMVectorReg scratch;
3006     uintptr_t oprsz = simd_oprsz(desc);
3007 
3008     if (unlikely(vd == vn0)) {
3009         vn0 = memcpy(&scratch, vn0, oprsz);
3010         if (vd == vn1) {
3011             vn1 = vn0;
3012         }
3013     } else if (unlikely(vd == vn1)) {
3014         vn1 = memcpy(&scratch, vn1, oprsz);
3015     }
3016 
3017     fn(vd, vn0, vn1, vm, oprsz, is_tbx);
3018 }
3019 
3020 #define DO_TB(SUFF, TYPE, H)                                            \
3021 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
3022                                 void *vm, uintptr_t oprsz, bool is_tbx) \
3023 {                                                                       \
3024     TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
3025     uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
3026     for (i = 0; i < nelem; ++i) {                                       \
3027         TYPE index = indexes[H(i)], val = 0;                            \
3028         if (index < nelem) {                                            \
3029             val = tbl0[H(index)];                                       \
3030         } else {                                                        \
3031             index -= nelem;                                             \
3032             if (tbl1 && index < nelem) {                                \
3033                 val = tbl1[H(index)];                                   \
3034             } else if (is_tbx) {                                        \
3035                 continue;                                               \
3036             }                                                           \
3037         }                                                               \
3038         d[H(i)] = val;                                                  \
3039     }                                                                   \
3040 }                                                                       \
3041 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3042 {                                                                       \
3043     do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                     \
3044 }                                                                       \
3045 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,            \
3046                              void *vm, uint32_t desc)                   \
3047 {                                                                       \
3048     do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);               \
3049 }                                                                       \
3050 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3051 {                                                                       \
3052     do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                      \
3053 }
3054 
3055 DO_TB(b, uint8_t, H1)
3056 DO_TB(h, uint16_t, H2)
3057 DO_TB(s, uint32_t, H4)
3058 DO_TB(d, uint64_t, H8)
3059 
3060 #undef DO_TB
3061 
3062 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3063 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
3064 {                                                              \
3065     intptr_t i, opr_sz = simd_oprsz(desc);                     \
3066     TYPED *d = vd;                                             \
3067     TYPES *n = vn;                                             \
3068     ARMVectorReg tmp;                                          \
3069     if (unlikely(vn - vd < opr_sz)) {                          \
3070         n = memcpy(&tmp, n, opr_sz / 2);                       \
3071     }                                                          \
3072     for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
3073         d[HD(i)] = n[HS(i)];                                   \
3074     }                                                          \
3075 }
3076 
3077 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3078 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3079 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3080 
3081 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3082 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3083 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3084 
3085 #undef DO_UNPK
3086 
3087 /* Mask of the predicate bits that belong to even-numbered elements of
3088  * width esz.  We also use this for expand_bits/compress_bits, and so
3089  * extend the same pattern out to 16-bit units.
3090  */
3091 static const uint64_t even_bit_esz_masks[5] = {
3092     0x5555555555555555ull,
3093     0x3333333333333333ull,
3094     0x0f0f0f0f0f0f0f0full,
3095     0x00ff00ff00ff00ffull,
3096     0x0000ffff0000ffffull,
3097 };
3098 
3099 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3100  * For N==0, this corresponds to the operation that in qemu/bitops.h
3101  * we call half_shuffle64; this algorithm is from Hacker's Delight,
3102  * section 7-2 Shuffling Bits.
3103  */
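/* E.g. expand_bits(0x0f, 0) == 0x55: each bit gains a zero above it. */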
3104 static uint64_t expand_bits(uint64_t x, int n)
3105 {
3106     int i;
3107 
3108     x &= 0xffffffffu;
3109     for (i = 4; i >= n; i--) {
3110         int sh = 1 << i;
3111         x = ((x << sh) | x) & even_bit_esz_masks[i];
3112     }
3113     return x;
3114 }
3115 
3116 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3117  * For N==0, this corresponds to the operation that in qemu/bitops.h
3118  * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3119  * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3120  */
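/* E.g. compress_bits(0x55, 0) == 0x0f, undoing the expansion above. */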
3121 static uint64_t compress_bits(uint64_t x, int n)
3122 {
3123     int i;
3124 
3125     for (i = n; i <= 4; i++) {
3126         int sh = 1 << i;
3127         x &= even_bit_esz_masks[i];
3128         x = (x >> sh) | x;
3129     }
3130     return x & 0xffffffffu;
3131 }
3132 
3133 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3134 {
3135     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3136     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3137     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3138     int esize = 1 << esz;
3139     uint64_t *d = vd;
3140     intptr_t i;
3141 
3142     if (oprsz <= 8) {
3143         uint64_t nn = *(uint64_t *)vn;
3144         uint64_t mm = *(uint64_t *)vm;
3145         int half = 4 * oprsz;
3146 
3147         nn = extract64(nn, high * half, half);
3148         mm = extract64(mm, high * half, half);
3149         nn = expand_bits(nn, esz);
3150         mm = expand_bits(mm, esz);
3151         d[0] = nn | (mm << esize);
3152     } else {
3153         ARMPredicateReg tmp;
3154 
3155         /* We produce output faster than we consume input.
3156            Therefore we must be mindful of possible overlap.  */
3157         if (vd == vn) {
3158             vn = memcpy(&tmp, vn, oprsz);
3159             if (vd == vm) {
3160                 vm = vn;
3161             }
3162         } else if (vd == vm) {
3163             vm = memcpy(&tmp, vm, oprsz);
3164         }
3165         if (high) {
3166             high = oprsz >> 1;
3167         }
3168 
3169         if ((oprsz & 7) == 0) {
3170             uint32_t *n = vn, *m = vm;
3171             high >>= 2;
3172 
3173             for (i = 0; i < oprsz / 8; i++) {
3174                 uint64_t nn = n[H4(high + i)];
3175                 uint64_t mm = m[H4(high + i)];
3176 
3177                 nn = expand_bits(nn, esz);
3178                 mm = expand_bits(mm, esz);
3179                 d[i] = nn | (mm << esize);
3180             }
3181         } else {
3182             uint8_t *n = vn, *m = vm;
3183             uint16_t *d16 = vd;
3184 
3185             for (i = 0; i < oprsz / 2; i++) {
3186                 uint16_t nn = n[H1(high + i)];
3187                 uint16_t mm = m[H1(high + i)];
3188 
3189                 nn = expand_bits(nn, esz);
3190                 mm = expand_bits(mm, esz);
3191                 d16[H2(i)] = nn | (mm << esize);
3192             }
3193         }
3194     }
3195 }
3196 
3197 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3198 {
3199     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3200     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3201     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3202     uint64_t *d = vd, *n = vn, *m = vm;
3203     uint64_t l, h;
3204     intptr_t i;
3205 
3206     if (oprsz <= 8) {
3207         l = compress_bits(n[0] >> odd, esz);
3208         h = compress_bits(m[0] >> odd, esz);
3209         d[0] = l | (h << (4 * oprsz));
3210     } else {
3211         ARMPredicateReg tmp_m;
3212         intptr_t oprsz_16 = oprsz / 16;
3213 
3214         if ((vm - vd) < (uintptr_t)oprsz) {
3215             m = memcpy(&tmp_m, vm, oprsz);
3216         }
3217 
3218         for (i = 0; i < oprsz_16; i++) {
3219             l = n[2 * i + 0];
3220             h = n[2 * i + 1];
3221             l = compress_bits(l >> odd, esz);
3222             h = compress_bits(h >> odd, esz);
3223             d[i] = l | (h << 32);
3224         }
3225 
3226         /*
3227          * For VL which is not a multiple of 512, the results from M do not
3228          * align nicely with the uint64_t for D.  Put the aligned results
3229          * from M into TMP_M and then copy it into place afterward.
3230          */
3231         if (oprsz & 15) {
3232             int final_shift = (oprsz & 15) * 2;
3233 
3234             l = n[2 * i + 0];
3235             h = n[2 * i + 1];
3236             l = compress_bits(l >> odd, esz);
3237             h = compress_bits(h >> odd, esz);
3238             d[i] = l | (h << final_shift);
3239 
3240             for (i = 0; i < oprsz_16; i++) {
3241                 l = m[2 * i + 0];
3242                 h = m[2 * i + 1];
3243                 l = compress_bits(l >> odd, esz);
3244                 h = compress_bits(h >> odd, esz);
3245                 tmp_m.p[i] = l | (h << 32);
3246             }
3247             l = m[2 * i + 0];
3248             h = m[2 * i + 1];
3249             l = compress_bits(l >> odd, esz);
3250             h = compress_bits(h >> odd, esz);
3251             tmp_m.p[i] = l | (h << final_shift);
3252 
3253             swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3254         } else {
3255             for (i = 0; i < oprsz_16; i++) {
3256                 l = m[2 * i + 0];
3257                 h = m[2 * i + 1];
3258                 l = compress_bits(l >> odd, esz);
3259                 h = compress_bits(h >> odd, esz);
3260                 d[oprsz_16 + i] = l | (h << 32);
3261             }
3262         }
3263     }
3264 }
3265 
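/*
 * TRN on predicates: with ODD clear, the bits of the even-numbered
 * elements of N stay in the even result positions and the matching
 * bits of M are shifted up into the odd positions; with ODD set, the
 * odd-numbered elements are used instead, N's shifted down and M's
 * left in place.
 */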
3266 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3267 {
3268     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3269     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3270     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3271     uint64_t *d = vd, *n = vn, *m = vm;
3272     uint64_t mask;
3273     int shr, shl;
3274     intptr_t i;
3275 
3276     shl = 1 << esz;
3277     shr = 0;
3278     mask = even_bit_esz_masks[esz];
3279     if (odd) {
3280         mask <<= shl;
3281         shr = shl;
3282         shl = 0;
3283     }
3284 
3285     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3286         uint64_t nn = (n[i] & mask) >> shr;
3287         uint64_t mm = (m[i] & mask) << shl;
3288         d[i] = nn + mm;
3289     }
3290 }
3291 
3292 /* Reverse units of 2**N bits.  */
3293 static uint64_t reverse_bits_64(uint64_t x, int n)
3294 {
3295     int i, sh;
3296 
3297     x = bswap64(x);
3298     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3299         uint64_t mask = even_bit_esz_masks[i];
3300         x = ((x & mask) << sh) | ((x >> sh) & mask);
3301     }
3302     return x;
3303 }
3304 
3305 static uint8_t reverse_bits_8(uint8_t x, int n)
3306 {
3307     static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3308     int i, sh;
3309 
3310     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3311         x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3312     }
3313     return x;
3314 }
3315 
3316 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3317 {
3318     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3319     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3320     intptr_t i, oprsz_2 = oprsz / 2;
3321 
3322     if (oprsz <= 8) {
3323         uint64_t l = *(uint64_t *)vn;
3324         l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3325         *(uint64_t *)vd = l;
3326     } else if ((oprsz & 15) == 0) {
3327         for (i = 0; i < oprsz_2; i += 8) {
3328             intptr_t ih = oprsz - 8 - i;
3329             uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3330             uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3331             *(uint64_t *)(vd + i) = h;
3332             *(uint64_t *)(vd + ih) = l;
3333         }
3334     } else {
3335         for (i = 0; i < oprsz_2; i += 1) {
3336             intptr_t il = H1(i);
3337             intptr_t ih = H1(oprsz - 1 - i);
3338             uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3339             uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3340             *(uint8_t *)(vd + il) = h;
3341             *(uint8_t *)(vd + ih) = l;
3342         }
3343     }
3344 }
3345 
3346 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3347 {
3348     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3349     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3350     uint64_t *d = vd;
3351     intptr_t i;
3352 
3353     if (oprsz <= 8) {
3354         uint64_t nn = *(uint64_t *)vn;
3355         int half = 4 * oprsz;
3356 
3357         nn = extract64(nn, high * half, half);
3358         nn = expand_bits(nn, 0);
3359         d[0] = nn;
3360     } else {
3361         ARMPredicateReg tmp_n;
3362 
3363         /* We produce output faster than we consume input.
3364            Therefore we must be mindful of possible overlap.  */
3365         if ((vn - vd) < (uintptr_t)oprsz) {
3366             vn = memcpy(&tmp_n, vn, oprsz);
3367         }
3368         if (high) {
3369             high = oprsz >> 1;
3370         }
3371 
3372         if ((oprsz & 7) == 0) {
3373             uint32_t *n = vn;
3374             high >>= 2;
3375 
3376             for (i = 0; i < oprsz / 8; i++) {
3377                 uint64_t nn = n[H4(high + i)];
3378                 d[i] = expand_bits(nn, 0);
3379             }
3380         } else {
3381             uint16_t *d16 = vd;
3382             uint8_t *n = vn;
3383 
3384             for (i = 0; i < oprsz / 2; i++) {
3385                 uint16_t nn = n[H1(high + i)];
3386                 d16[H2(i)] = expand_bits(nn, 0);
3387             }
3388         }
3389     }
3390 }
3391 
3392 #define DO_ZIP(NAME, TYPE, H) \
3393 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
3394 {                                                                    \
3395     intptr_t oprsz = simd_oprsz(desc);                               \
3396     intptr_t odd_ofs = simd_data(desc);                              \
3397     intptr_t i, oprsz_2 = oprsz / 2;                                 \
3398     ARMVectorReg tmp_n, tmp_m;                                       \
3399     /* We produce output faster than we consume input.               \
3400        Therefore we must be mindful of possible overlap.  */         \
3401     if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
3402         vn = memcpy(&tmp_n, vn, oprsz);                              \
3403     }                                                                \
3404     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
3405         vm = memcpy(&tmp_m, vm, oprsz);                              \
3406     }                                                                \
3407     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
3408         *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3409         *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
3410             *(TYPE *)(vm + odd_ofs + H(i));                          \
3411     }                                                                \
3412     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
3413         memset(vd + oprsz - 16, 0, 16);                              \
3414     }                                                                \
3415 }
3416 
3417 DO_ZIP(sve_zip_b, uint8_t, H1)
3418 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3419 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3420 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3421 DO_ZIP(sve2_zip_q, Int128, )
3422 
3423 #define DO_UZP(NAME, TYPE, H) \
3424 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3425 {                                                                      \
3426     intptr_t oprsz = simd_oprsz(desc);                                 \
3427     intptr_t odd_ofs = simd_data(desc);                                \
3428     intptr_t i, p;                                                     \
3429     ARMVectorReg tmp_m;                                                \
3430     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
3431         vm = memcpy(&tmp_m, vm, oprsz);                                \
3432     }                                                                  \
3433     i = 0, p = odd_ofs;                                                \
3434     do {                                                               \
3435         *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
3436         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3437     } while (p < oprsz);                                               \
3438     p -= oprsz;                                                        \
3439     do {                                                               \
3440         *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
3441         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3442     } while (p < oprsz);                                               \
3443     tcg_debug_assert(i == oprsz);                                      \
3444 }
3445 
3446 DO_UZP(sve_uzp_b, uint8_t, H1)
3447 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3448 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3449 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3450 DO_UZP(sve2_uzp_q, Int128, )
3451 
3452 #define DO_TRN(NAME, TYPE, H) \
3453 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3454 {                                                                      \
3455     intptr_t oprsz = simd_oprsz(desc);                                 \
3456     intptr_t odd_ofs = simd_data(desc);                                \
3457     intptr_t i;                                                        \
3458     for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
3459         TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
3460         TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
3461         *(TYPE *)(vd + H(i + 0)) = ae;                                 \
3462         *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
3463     }                                                                  \
3464     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
3465         memset(vd + oprsz - 16, 0, 16);                                \
3466     }                                                                  \
3467 }
3468 
3469 DO_TRN(sve_trn_b, uint8_t, H1)
3470 DO_TRN(sve_trn_h, uint16_t, H1_2)
3471 DO_TRN(sve_trn_s, uint32_t, H1_4)
3472 DO_TRN(sve_trn_d, uint64_t, H1_8)
3473 DO_TRN(sve2_trn_q, Int128, )
3474 
3475 #undef DO_ZIP
3476 #undef DO_UZP
3477 #undef DO_TRN
3478 
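/*
 * COMPACT copies the active elements to the least-significant end of
 * the destination, preserving their order, and zeroes the remainder.
 */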
3479 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3480 {
3481     intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3482     uint32_t *d = vd, *n = vn;
3483     uint8_t *pg = vg;
3484 
3485     for (i = j = 0; i < opr_sz; i++) {
3486         if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3487             d[H4(j)] = n[H4(i)];
3488             j++;
3489         }
3490     }
3491     for (; j < opr_sz; j++) {
3492         d[H4(j)] = 0;
3493     }
3494 }
3495 
3496 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3497 {
3498     intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3499     uint64_t *d = vd, *n = vn;
3500     uint8_t *pg = vg;
3501 
3502     for (i = j = 0; i < opr_sz; i++) {
3503         if (pg[H1(i)] & 1) {
3504             d[j] = n[i];
3505             j++;
3506         }
3507     }
3508     for (; j < opr_sz; j++) {
3509         d[j] = 0;
3510     }
3511 }
3512 
3513 /* Similar to the ARM LastActiveElement pseudocode function, except the
3514  * result is multiplied by the element size.  This includes the not found
3515  * indication; e.g. not found for esz=3 is -8.
3516  */
3517 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3518 {
3519     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3520     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3521 
3522     return last_active_element(vg, words, esz);
3523 }
3524 
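/*
 * SPLICE copies the bytes of Zn spanning the first through the last
 * active element to the start of the destination, then fills the
 * remainder with the leading bytes of Zm.
 */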
3525 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3526 {
3527     intptr_t opr_sz = simd_oprsz(desc) / 8;
3528     int esz = simd_data(desc);
3529     uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3530     intptr_t i, first_i, last_i;
3531     ARMVectorReg tmp;
3532 
3533     first_i = last_i = 0;
3534     first_g = last_g = 0;
3535 
3536     /* Find the extent of the active elements within VG.  */
3537     for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3538         pg = *(uint64_t *)(vg + i) & mask;
3539         if (pg) {
3540             if (last_g == 0) {
3541                 last_g = pg;
3542                 last_i = i;
3543             }
3544             first_g = pg;
3545             first_i = i;
3546         }
3547     }
3548 
3549     len = 0;
3550     if (first_g != 0) {
3551         first_i = first_i * 8 + ctz64(first_g);
3552         last_i = last_i * 8 + 63 - clz64(last_g);
3553         len = last_i - first_i + (1 << esz);
3554         if (vd == vm) {
3555             vm = memcpy(&tmp, vm, opr_sz * 8);
3556         }
3557         swap_memmove(vd, vn + first_i, len);
3558     }
3559     swap_memmove(vd + len, vm, opr_sz * 8 - len);
3560 }
3561 
3562 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3563                             void *vg, uint32_t desc)
3564 {
3565     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3566     uint64_t *d = vd, *n = vn, *m = vm;
3567     uint8_t *pg = vg;
3568 
3569     for (i = 0; i < opr_sz; i += 1) {
3570         uint64_t nn = n[i], mm = m[i];
3571         uint64_t pp = expand_pred_b(pg[H1(i)]);
3572         d[i] = (nn & pp) | (mm & ~pp);
3573     }
3574 }
3575 
3576 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3577                             void *vg, uint32_t desc)
3578 {
3579     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3580     uint64_t *d = vd, *n = vn, *m = vm;
3581     uint8_t *pg = vg;
3582 
3583     for (i = 0; i < opr_sz; i += 1) {
3584         uint64_t nn = n[i], mm = m[i];
3585         uint64_t pp = expand_pred_h(pg[H1(i)]);
3586         d[i] = (nn & pp) | (mm & ~pp);
3587     }
3588 }
3589 
3590 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3591                             void *vg, uint32_t desc)
3592 {
3593     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3594     uint64_t *d = vd, *n = vn, *m = vm;
3595     uint8_t *pg = vg;
3596 
3597     for (i = 0; i < opr_sz; i += 1) {
3598         uint64_t nn = n[i], mm = m[i];
3599         uint64_t pp = expand_pred_s(pg[H1(i)]);
3600         d[i] = (nn & pp) | (mm & ~pp);
3601     }
3602 }
3603 
3604 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3605                             void *vg, uint32_t desc)
3606 {
3607     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3608     uint64_t *d = vd, *n = vn, *m = vm;
3609     uint8_t *pg = vg;
3610 
3611     for (i = 0; i < opr_sz; i += 1) {
3612         uint64_t nn = n[i], mm = m[i];
3613         d[i] = (pg[H1(i)] & 1 ? nn : mm);
3614     }
3615 }
3616 
3617 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3618                             void *vg, uint32_t desc)
3619 {
3620     intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3621     Int128 *d = vd, *n = vn, *m = vm;
3622     uint16_t *pg = vg;
3623 
3624     for (i = 0; i < opr_sz; i += 1) {
3625         d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3626     }
3627 }
3628 
3629 /* Two operand comparison controlled by a predicate.
3630  * ??? It is very tempting to want to be able to expand this inline
3631  * with x86 instructions, e.g.
3632  *
3633  *    vcmpeqw    zm, zn, %ymm0
3634  *    vpmovmskb  %ymm0, %eax
3635  *    and        $0x5555, %eax
3636  *    and        pg, %eax
3637  *
3638  * or even aarch64, e.g.
3639  *
3640  *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3641  *    cmeq       v0.8h, zn, zm
3642  *    and        v0.8h, v0.8h, mask
3643  *    addv       h0, v0.8h
3644  *    and        v0.8b, pg
3645  *
3646  * However, coming up with an abstraction that allows vector inputs and
3647  * a scalar output, and also handles the byte-ordering of sub-uint64_t
3648  * scalar outputs, is tricky.
3649  */
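/*
 * The inner loops below walk one 64-byte block of data backward while
 * accumulating one 64-bit word of results: the element that starts at
 * byte offset k within the block contributes bit k.  The word is then
 * ANDed with the governing predicate, itself restricted by MASK to the
 * bit positions that begin an element.
 */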
3650 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
3651 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3652 {                                                                            \
3653     intptr_t opr_sz = simd_oprsz(desc);                                      \
3654     uint32_t flags = PREDTEST_INIT;                                          \
3655     intptr_t i = opr_sz;                                                     \
3656     do {                                                                     \
3657         uint64_t out = 0, pg;                                                \
3658         do {                                                                 \
3659             i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
3660             TYPE nn = *(TYPE *)(vn + H(i));                                  \
3661             TYPE mm = *(TYPE *)(vm + H(i));                                  \
3662             out |= nn OP mm;                                                 \
3663         } while (i & 63);                                                    \
3664         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3665         out &= pg;                                                           \
3666         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3667         flags = iter_predtest_bwd(out, pg, flags);                           \
3668     } while (i > 0);                                                         \
3669     return flags;                                                            \
3670 }
3671 
3672 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3673     DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3674 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3675     DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3676 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3677     DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3678 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3679     DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3680 
3681 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
3682 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3683 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3684 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3685 
3686 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
3687 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3688 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3689 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3690 
3691 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
3692 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3693 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3694 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3695 
3696 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
3697 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3698 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3699 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3700 
3701 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
3702 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3703 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3704 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3705 
3706 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
3707 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3708 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3709 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3710 
3711 #undef DO_CMP_PPZZ_B
3712 #undef DO_CMP_PPZZ_H
3713 #undef DO_CMP_PPZZ_S
3714 #undef DO_CMP_PPZZ_D
3715 #undef DO_CMP_PPZZ
3716 
3717 /* Similar, but the second source is "wide".  */
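/*
 * Each 64-bit element of Zm is compared against every narrow element
 * of Zn sharing the same doubleword, hence the extra inner loop over
 * (i & 7).
 */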
3718 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
3719 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3720 {                                                                            \
3721     intptr_t opr_sz = simd_oprsz(desc);                                      \
3722     uint32_t flags = PREDTEST_INIT;                                          \
3723     intptr_t i = opr_sz;                                                     \
3724     do {                                                                     \
3725         uint64_t out = 0, pg;                                                \
3726         do {                                                                 \
3727             TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
3728             do {                                                             \
3729                 i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
3730                 TYPE nn = *(TYPE *)(vn + H(i));                              \
3731                 out |= nn OP mm;                                             \
3732             } while (i & 7);                                                 \
3733         } while (i & 63);                                                    \
3734         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3735         out &= pg;                                                           \
3736         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3737         flags = iter_predtest_bwd(out, pg, flags);                           \
3738     } while (i > 0);                                                         \
3739     return flags;                                                            \
3740 }
3741 
3742 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3743     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
3744 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3745     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3746 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3747     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3748 
3749 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
3750 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3751 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3752 
3753 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
3754 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3755 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3756 
3757 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
3758 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
3759 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)
3760 
3761 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
3762 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
3763 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)
3764 
3765 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
3766 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3767 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3768 
3769 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
3770 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3771 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3772 
3773 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
3774 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
3775 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)
3776 
3777 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
3778 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
3779 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)
3780 
3781 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
3782 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3783 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3784 
3785 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
3786 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3787 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3788 
3789 #undef DO_CMP_PPZW_B
3790 #undef DO_CMP_PPZW_H
3791 #undef DO_CMP_PPZW_S
3792 #undef DO_CMP_PPZW
3793 
3794 /* Similar, but the second source is immediate.  */
3795 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
3796 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
3797 {                                                                    \
3798     intptr_t opr_sz = simd_oprsz(desc);                              \
3799     uint32_t flags = PREDTEST_INIT;                                  \
3800     TYPE mm = simd_data(desc);                                       \
3801     intptr_t i = opr_sz;                                             \
3802     do {                                                             \
3803         uint64_t out = 0, pg;                                        \
3804         do {                                                         \
3805             i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
3806             TYPE nn = *(TYPE *)(vn + H(i));                          \
3807             out |= nn OP mm;                                         \
3808         } while (i & 63);                                            \
3809         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
3810         out &= pg;                                                   \
3811         *(uint64_t *)(vd + (i >> 3)) = out;                          \
3812         flags = iter_predtest_bwd(out, pg, flags);                   \
3813     } while (i > 0);                                                 \
3814     return flags;                                                    \
3815 }
3816 
3817 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3818     DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3819 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3820     DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3821 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3822     DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3823 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3824     DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3825 
3826 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
3827 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3828 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3829 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3830 
3831 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
3832 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3833 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3834 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3835 
3836 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
3837 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3838 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3839 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3840 
3841 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
3842 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3843 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3844 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3845 
3846 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
3847 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3848 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3849 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3850 
3851 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
3852 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3853 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3854 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3855 
3856 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
3857 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3858 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3859 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3860 
3861 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
3862 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3863 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3864 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3865 
3866 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
3867 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3868 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3869 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3870 
3871 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
3872 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3873 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3874 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3875 
3876 #undef DO_CMP_PPZI_B
3877 #undef DO_CMP_PPZI_H
3878 #undef DO_CMP_PPZI_S
3879 #undef DO_CMP_PPZI_D
3880 #undef DO_CMP_PPZI
3881 
3882 /* Similar to the ARM LastActive pseudocode function.  */
3883 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3884 {
3885     intptr_t i;
3886 
3887     for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3888         uint64_t pg = *(uint64_t *)(vg + i);
3889         if (pg) {
3890             return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3891         }
3892     }
3893     return 0;
3894 }
3895 
3896 /* Compute a mask into RETB that is true for all G, up to and including
3897  * (if after) or excluding (if !after) the first G & N.
3898  * Return true if BRK found.
3899  */
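/*
 * E.g. with G all true and N = 0b01000, the first active N bit gives
 * B = 0b01111 for break-after and B = 0b00111 for break-before.
 */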
3900 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3901                         bool brk, bool after)
3902 {
3903     uint64_t b;
3904 
3905     if (brk) {
3906         b = 0;
3907     } else if ((g & n) == 0) {
3908         /* For all G, no N are set; break not found.  */
3909         b = g;
3910     } else {
3911         /* Break somewhere in N.  Locate it.  */
3912         b = g & n;            /* guard true, pred true */
3913         b = b & -b;           /* first such */
3914         if (after) {
3915             b = b | (b - 1);  /* break after same */
3916         } else {
3917             b = b - 1;        /* break before same */
3918         }
3919         brk = true;
3920     }
3921 
3922     *retb = b;
3923     return brk;
3924 }
3925 
3926 /* Compute a zeroing BRK.  */
3927 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3928                           intptr_t oprsz, bool after)
3929 {
3930     bool brk = false;
3931     intptr_t i;
3932 
3933     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3934         uint64_t this_b, this_g = g[i];
3935 
3936         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3937         d[i] = this_b & this_g;
3938     }
3939 }
3940 
3941 /* Likewise, but also compute flags.  */
3942 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3943                                intptr_t oprsz, bool after)
3944 {
3945     uint32_t flags = PREDTEST_INIT;
3946     bool brk = false;
3947     intptr_t i;
3948 
3949     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3950         uint64_t this_b, this_d, this_g = g[i];
3951 
3952         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3953         d[i] = this_d = this_b & this_g;
3954         flags = iter_predtest_fwd(this_d, this_g, flags);
3955     }
3956     return flags;
3957 }
3958 
3959 /* Compute a merging BRK.  */
3960 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3961                           intptr_t oprsz, bool after)
3962 {
3963     bool brk = false;
3964     intptr_t i;
3965 
3966     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3967         uint64_t this_b, this_g = g[i];
3968 
3969         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3970         d[i] = (this_b & this_g) | (d[i] & ~this_g);
3971     }
3972 }
3973 
3974 /* Likewise, but also compute flags.  */
3975 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3976                                intptr_t oprsz, bool after)
3977 {
3978     uint32_t flags = PREDTEST_INIT;
3979     bool brk = false;
3980     intptr_t i;
3981 
3982     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3983         uint64_t this_b, this_d = d[i], this_g = g[i];
3984 
3985         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3986         d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3987         flags = iter_predtest_fwd(this_d, this_g, flags);
3988     }
3989     return flags;
3990 }
3991 
3992 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3993 {
3994     /* It is quicker to zero the whole predicate than loop on OPRSZ.
3995      * The compiler should turn this into 4 64-bit integer stores.
3996      */
3997     memset(d, 0, sizeof(ARMPredicateReg));
3998     return PREDTEST_INIT;
3999 }
4000 
4001 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
4002                        uint32_t pred_desc)
4003 {
4004     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4005     if (last_active_pred(vn, vg, oprsz)) {
4006         compute_brk_z(vd, vm, vg, oprsz, true);
4007     } else {
4008         do_zero(vd, oprsz);
4009     }
4010 }
4011 
4012 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
4013                             uint32_t pred_desc)
4014 {
4015     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4016     if (last_active_pred(vn, vg, oprsz)) {
4017         return compute_brks_z(vd, vm, vg, oprsz, true);
4018     } else {
4019         return do_zero(vd, oprsz);
4020     }
4021 }
4022 
4023 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
4024                        uint32_t pred_desc)
4025 {
4026     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4027     if (last_active_pred(vn, vg, oprsz)) {
4028         compute_brk_z(vd, vm, vg, oprsz, false);
4029     } else {
4030         do_zero(vd, oprsz);
4031     }
4032 }
4033 
4034 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4035                             uint32_t pred_desc)
4036 {
4037     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4038     if (last_active_pred(vn, vg, oprsz)) {
4039         return compute_brks_z(vd, vm, vg, oprsz, false);
4040     } else {
4041         return do_zero(vd, oprsz);
4042     }
4043 }
4044 
4045 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4046 {
4047     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4048     compute_brk_z(vd, vn, vg, oprsz, true);
4049 }
4050 
4051 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4052 {
4053     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4054     return compute_brks_z(vd, vn, vg, oprsz, true);
4055 }
4056 
4057 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4058 {
4059     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4060     compute_brk_z(vd, vn, vg, oprsz, false);
4061 }
4062 
4063 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4064 {
4065     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4066     return compute_brks_z(vd, vn, vg, oprsz, false);
4067 }
4068 
4069 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4070 {
4071     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4072     compute_brk_m(vd, vn, vg, oprsz, true);
4073 }
4074 
4075 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4076 {
4077     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4078     return compute_brks_m(vd, vn, vg, oprsz, true);
4079 }
4080 
4081 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4082 {
4083     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4084     compute_brk_m(vd, vn, vg, oprsz, false);
4085 }
4086 
4087 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4088 {
4089     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4090     return compute_brks_m(vd, vn, vg, oprsz, false);
4091 }
4092 
4093 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4094 {
4095     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4096     if (!last_active_pred(vn, vg, oprsz)) {
4097         do_zero(vd, oprsz);
4098     }
4099 }
4100 
4101 /* As if PredTest(Ones(PL), D, esz).  */
4102 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4103                               uint64_t esz_mask)
4104 {
4105     uint32_t flags = PREDTEST_INIT;
4106     intptr_t i;
4107 
4108     for (i = 0; i < oprsz / 8; i++) {
4109         flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4110     }
4111     if (oprsz & 7) {
4112         uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4113         flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4114     }
4115     return flags;
4116 }
4117 
4118 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4119 {
4120     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4121     if (last_active_pred(vn, vg, oprsz)) {
4122         return predtest_ones(vd, oprsz, -1);
4123     } else {
4124         return do_zero(vd, oprsz);
4125     }
4126 }
4127 
4128 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4129 {
4130     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4131     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4132     uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4133     intptr_t i;
4134 
4135     for (i = 0; i < words; ++i) {
4136         uint64_t t = n[i] & g[i] & mask;
4137         sum += ctpop64(t);
4138     }
4139     return sum;
4140 }
4141 
4142 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4143 {
4144     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4145     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4146     uint64_t esz_mask = pred_esz_masks[esz];
4147     ARMPredicateReg *d = vd;
4148     uint32_t flags;
4149     intptr_t i;
4150 
4151     /* Begin with a zero predicate register.  */
4152     flags = do_zero(d, oprsz);
4153     if (count == 0) {
4154         return flags;
4155     }
4156 
4157     /* Set all of the requested bits.  */
4158     for (i = 0; i < count / 64; ++i) {
4159         d->p[i] = esz_mask;
4160     }
4161     if (count & 63) {
4162         d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4163     }
4164 
4165     return predtest_ones(d, oprsz, esz_mask);
4166 }
4167 
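/*
 * As sve_whilel above, but setting the most-significant COUNT predicate
 * bits (subject to the element-size mask) rather than the least
 * significant ones: the last COUNT elements become active and everything
 * below them is cleared.  This serves the decrementing WHILE* comparisons.
 */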
4168 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4169 {
4170     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4171     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4172     uint64_t esz_mask = pred_esz_masks[esz];
4173     ARMPredicateReg *d = vd;
4174     intptr_t i, invcount, oprbits;
4175     uint64_t bits;
4176 
4177     if (count == 0) {
4178         return do_zero(d, oprsz);
4179     }
4180 
4181     oprbits = oprsz * 8;
4182     tcg_debug_assert(count <= oprbits);
4183 
4184     bits = esz_mask;
4185     if (oprbits & 63) {
4186         bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4187     }
4188 
4189     invcount = oprbits - count;
4190     for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4191         d->p[i] = bits;
4192         bits = esz_mask;
4193     }
4194 
4195     d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4196 
4197     while (--i >= 0) {
4198         d->p[i] = 0;
4199     }
4200 
4201     return predtest_ones(d, oprsz, esz_mask);
4202 }
4203 
4204 /* Recursive reduction on a function;
4205  * Cf. the ARM ARM function ReducePredicated.
4206  *
4207  * While it would be possible to write this without the DATA temporary,
4208  * it is much simpler to process the predicate register this way.
4209  * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4210  * little to gain with a more complex non-recursive form.
4211  */
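/*
 * As an illustration, an 8-element reduction is evaluated as
 *   FUNC(FUNC(FUNC(d0,d1), FUNC(d2,d3)), FUNC(FUNC(d4,d5), FUNC(d6,d7)));
 * inactive elements, and any trailing elements up to MAXSZ (the size
 * passed in simd_data, rounded to a power of two by the translator),
 * are filled with IDENT so that the recursion always splits evenly.
 */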
4212 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
4213 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4214 {                                                                     \
4215     if (n == 1) {                                                     \
4216         return *data;                                                 \
4217     } else {                                                          \
4218         uintptr_t half = n / 2;                                       \
4219         TYPE lo = NAME##_reduce(data, status, half);                  \
4220         TYPE hi = NAME##_reduce(data + half, status, half);           \
4221         return FUNC(lo, hi, status);                                  \
4222     }                                                                 \
4223 }                                                                     \
4224 uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \
4225 {                                                                     \
4226     uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
4227     TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
4228     for (i = 0; i < oprsz; ) {                                        \
4229         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
4230         do {                                                          \
4231             TYPE nn = *(TYPE *)(vn + H(i));                           \
4232             *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
4233             i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
4234         } while (i & 15);                                             \
4235     }                                                                 \
4236     for (; i < maxsz; i += sizeof(TYPE)) {                            \
4237         *(TYPE *)((void *)data + i) = IDENT;                          \
4238     }                                                                 \
4239     return NAME##_reduce(data, s, maxsz / sizeof(TYPE));              \
4240 }
4241 
4242 DO_REDUCE(sve_faddv_h, float16, H1_2, float16_add, float16_zero)
4243 DO_REDUCE(sve_faddv_s, float32, H1_4, float32_add, float32_zero)
4244 DO_REDUCE(sve_faddv_d, float64, H1_8, float64_add, float64_zero)
4245 
4246 /* Identity is floatN_default_nan, without the function call.  */
4247 DO_REDUCE(sve_fminnmv_h, float16, H1_2, float16_minnum, 0x7E00)
4248 DO_REDUCE(sve_fminnmv_s, float32, H1_4, float32_minnum, 0x7FC00000)
4249 DO_REDUCE(sve_fminnmv_d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL)
4250 
4251 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, float16_maxnum, 0x7E00)
4252 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, float32_maxnum, 0x7FC00000)
4253 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL)
4254 
4255 DO_REDUCE(sve_fminv_h, float16, H1_2, float16_min, float16_infinity)
4256 DO_REDUCE(sve_fminv_s, float32, H1_4, float32_min, float32_infinity)
4257 DO_REDUCE(sve_fminv_d, float64, H1_8, float64_min, float64_infinity)
4258 
4259 DO_REDUCE(sve_fmaxv_h, float16, H1_2, float16_max, float16_chs(float16_infinity))
4260 DO_REDUCE(sve_fmaxv_s, float32, H1_4, float32_max, float32_chs(float32_infinity))
4261 DO_REDUCE(sve_fmaxv_d, float64, H1_8, float64_max, float64_chs(float64_infinity))
4262 
4263 DO_REDUCE(sve_ah_fminv_h, float16, H1_2, helper_vfp_ah_minh, float16_infinity)
4264 DO_REDUCE(sve_ah_fminv_s, float32, H1_4, helper_vfp_ah_mins, float32_infinity)
4265 DO_REDUCE(sve_ah_fminv_d, float64, H1_8, helper_vfp_ah_mind, float64_infinity)
4266 
4267 DO_REDUCE(sve_ah_fmaxv_h, float16, H1_2, helper_vfp_ah_maxh,
4268           float16_chs(float16_infinity))
4269 DO_REDUCE(sve_ah_fmaxv_s, float32, H1_4, helper_vfp_ah_maxs,
4270           float32_chs(float32_infinity))
4271 DO_REDUCE(sve_ah_fmaxv_d, float64, H1_8, helper_vfp_ah_maxd,
4272           float64_chs(float64_infinity))
4273 
4274 #undef DO_REDUCE
4275 
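/*
 * FADDA is a strictly ordered accumulation: unlike the tree reductions
 * above, the running sum is threaded through the active elements in
 * element order, so the rounding matches a sequential left-to-right sum;
 * inactive elements are simply skipped.
 */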
4276 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4277                              float_status *status, uint32_t desc)
4278 {
4279     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4280     float16 result = nn;
4281 
4282     do {
4283         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4284         do {
4285             if (pg & 1) {
4286                 float16 mm = *(float16 *)(vm + H1_2(i));
4287                 result = float16_add(result, mm, status);
4288             }
4289             i += sizeof(float16), pg >>= sizeof(float16);
4290         } while (i & 15);
4291     } while (i < opr_sz);
4292 
4293     return result;
4294 }
4295 
4296 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4297                              float_status *status, uint32_t desc)
4298 {
4299     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4300     float32 result = nn;
4301 
4302     do {
4303         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4304         do {
4305             if (pg & 1) {
4306                 float32 mm = *(float32 *)(vm + H1_2(i));
4307                 result = float32_add(result, mm, status);
4308             }
4309             i += sizeof(float32), pg >>= sizeof(float32);
4310         } while (i & 15);
4311     } while (i < opr_sz);
4312 
4313     return result;
4314 }
4315 
4316 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4317                              float_status *status, uint32_t desc)
4318 {
4319     intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4320     uint64_t *m = vm;
4321     uint8_t *pg = vg;
4322 
4323     for (i = 0; i < opr_sz; i++) {
4324         if (pg[H1(i)] & 1) {
4325             nn = float64_add(nn, m[i], status);
4326         }
4327     }
4328 
4329     return nn;
4330 }
4331 
4332 /* Fully general three-operand expander, controlled by a predicate,
4333  * with the extra float_status parameter.
4334  */
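/*
 * The expanders below walk the vector backwards in 64-byte blocks so
 * that one 64-bit predicate word governs each block.  Predicates hold
 * one bit per vector byte, with only the least significant bit of each
 * element's span significant, so (pg >> (i & 63)) & 1 tests whether the
 * element at byte offset i is active.
 */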
4335 #define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
4336 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
4337                   float_status *status, uint32_t desc)          \
4338 {                                                               \
4339     intptr_t i = simd_oprsz(desc);                              \
4340     uint64_t *g = vg;                                           \
4341     do {                                                        \
4342         uint64_t pg = g[(i - 1) >> 6];                          \
4343         do {                                                    \
4344             i -= sizeof(TYPE);                                  \
4345             if (likely((pg >> (i & 63)) & 1)) {                 \
4346                 TYPE nn = *(TYPE *)(vn + H(i));                 \
4347                 TYPE mm = *(TYPE *)(vm + H(i));                 \
4348                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
4349             }                                                   \
4350         } while (i & 63);                                       \
4351     } while (i != 0);                                           \
4352 }
4353 
4354 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4355 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4356 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4357 
4358 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4359 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4360 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4361 
4362 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4363 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4364 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4365 
4366 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4367 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4368 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4369 
4370 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4371 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4372 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4373 
4374 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4375 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4376 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4377 
4378 DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh)
4379 DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins)
4380 DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind)
4381 
4382 DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh)
4383 DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs)
4384 DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd)
4385 
4386 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4387 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4388 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4389 
4390 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4391 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4392 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4393 
4394 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4395 {
4396     return float16_abs(float16_sub(a, b, s));
4397 }
4398 
4399 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4400 {
4401     return float32_abs(float32_sub(a, b, s));
4402 }
4403 
4404 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4405 {
4406     return float64_abs(float64_sub(a, b, s));
4407 }
4408 
4409 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
4410 static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat)
4411 {
4412     float16 r = float16_sub(op1, op2, stat);
4413     return float16_is_any_nan(r) ? r : float16_abs(r);
4414 }
4415 
4416 static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat)
4417 {
4418     float32 r = float32_sub(op1, op2, stat);
4419     return float32_is_any_nan(r) ? r : float32_abs(r);
4420 }
4421 
4422 static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat)
4423 {
4424     float64 r = float64_sub(op1, op2, stat);
4425     return float64_is_any_nan(r) ? r : float64_abs(r);
4426 }
4427 
4428 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4429 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4430 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4431 DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h)
4432 DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s)
4433 DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d)
4434 
4435 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4436 {
4437     int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4438     return float64_scalbn(a, b_int, s);
4439 }
4440 
4441 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4442 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4443 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4444 
4445 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4446 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4447 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4448 
4449 #undef DO_ZPZZ_FP
4450 
4451 /* Three-operand expander, with one scalar operand, controlled by
4452  * a predicate, with the extra float_status parameter.
4453  */
4454 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4455 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
4456                   float_status *status, uint32_t desc)            \
4457 {                                                                 \
4458     intptr_t i = simd_oprsz(desc);                                \
4459     uint64_t *g = vg;                                             \
4460     TYPE mm = scalar;                                             \
4461     do {                                                          \
4462         uint64_t pg = g[(i - 1) >> 6];                            \
4463         do {                                                      \
4464             i -= sizeof(TYPE);                                    \
4465             if (likely((pg >> (i & 63)) & 1)) {                   \
4466                 TYPE nn = *(TYPE *)(vn + H(i));                   \
4467                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
4468             }                                                     \
4469         } while (i & 63);                                         \
4470     } while (i != 0);                                             \
4471 }
4472 
4473 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4474 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4475 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4476 
4477 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4478 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4479 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4480 
4481 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4482 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4483 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4484 
4485 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4486 {
4487     return float16_sub(b, a, s);
4488 }
4489 
4490 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4491 {
4492     return float32_sub(b, a, s);
4493 }
4494 
4495 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4496 {
4497     return float64_sub(b, a, s);
4498 }
4499 
4500 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4501 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4502 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4503 
4504 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4505 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4506 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4507 
4508 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4509 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4510 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4511 
4512 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4513 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4514 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4515 
4516 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4517 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4518 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4519 
4520 DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh)
4521 DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs)
4522 DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd)
4523 
4524 DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh)
4525 DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins)
4526 DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind)
4527 
4528 /* Fully general two-operand expander, controlled by a predicate,
4529  * with the extra float_status parameter.
4530  */
4531 #define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
4532 void HELPER(NAME)(void *vd, void *vn, void *vg,                       \
4533                   float_status *status, uint32_t desc)                \
4534 {                                                                     \
4535     intptr_t i = simd_oprsz(desc);                                    \
4536     uint64_t *g = vg;                                                 \
4537     do {                                                              \
4538         uint64_t pg = g[(i - 1) >> 6];                                \
4539         do {                                                          \
4540             i -= sizeof(TYPE);                                        \
4541             if (likely((pg >> (i & 63)) & 1)) {                       \
4542                 TYPE nn = *(TYPE *)(vn + H(i));                       \
4543                 *(TYPE *)(vd + H(i)) = OP(nn, status);                \
4544             }                                                         \
4545         } while (i & 63);                                             \
4546     } while (i != 0);                                                 \
4547 }
4548 
4549 /* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
4550  * FZ16.  When converting from fp16, this affects flushing input denormals;
4551  * when converting to fp16, this affects flushing output denormals.
4552  */
4553 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4554 {
4555     bool save = get_flush_inputs_to_zero(fpst);
4556     float32 ret;
4557 
4558     set_flush_inputs_to_zero(false, fpst);
4559     ret = float16_to_float32(f, true, fpst);
4560     set_flush_inputs_to_zero(save, fpst);
4561     return ret;
4562 }
4563 
4564 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4565 {
4566     bool save = get_flush_inputs_to_zero(fpst);
4567     float64 ret;
4568 
4569     set_flush_inputs_to_zero(false, fpst);
4570     ret = float16_to_float64(f, true, fpst);
4571     set_flush_inputs_to_zero(save, fpst);
4572     return ret;
4573 }
4574 
4575 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4576 {
4577     bool save = get_flush_to_zero(fpst);
4578     float16 ret;
4579 
4580     set_flush_to_zero(false, fpst);
4581     ret = float32_to_float16(f, true, fpst);
4582     set_flush_to_zero(save, fpst);
4583     return ret;
4584 }
4585 
4586 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4587 {
4588     bool save = get_flush_to_zero(fpst);
4589     float16 ret;
4590 
4591     set_flush_to_zero(false, fpst);
4592     ret = float64_to_float16(f, true, fpst);
4593     set_flush_to_zero(save, fpst);
4594     return ret;
4595 }
4596 
4597 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4598 {
4599     if (float16_is_any_nan(f)) {
4600         float_raise(float_flag_invalid, s);
4601         return 0;
4602     }
4603     return float16_to_int16_round_to_zero(f, s);
4604 }
4605 
4606 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4607 {
4608     if (float16_is_any_nan(f)) {
4609         float_raise(float_flag_invalid, s);
4610         return 0;
4611     }
4612     return float16_to_int64_round_to_zero(f, s);
4613 }
4614 
4615 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4616 {
4617     if (float32_is_any_nan(f)) {
4618         float_raise(float_flag_invalid, s);
4619         return 0;
4620     }
4621     return float32_to_int64_round_to_zero(f, s);
4622 }
4623 
4624 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4625 {
4626     if (float64_is_any_nan(f)) {
4627         float_raise(float_flag_invalid, s);
4628         return 0;
4629     }
4630     return float64_to_int64_round_to_zero(f, s);
4631 }
4632 
4633 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4634 {
4635     if (float16_is_any_nan(f)) {
4636         float_raise(float_flag_invalid, s);
4637         return 0;
4638     }
4639     return float16_to_uint16_round_to_zero(f, s);
4640 }
4641 
4642 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4643 {
4644     if (float16_is_any_nan(f)) {
4645         float_raise(float_flag_invalid, s);
4646         return 0;
4647     }
4648     return float16_to_uint64_round_to_zero(f, s);
4649 }
4650 
4651 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4652 {
4653     if (float32_is_any_nan(f)) {
4654         float_raise(float_flag_invalid, s);
4655         return 0;
4656     }
4657     return float32_to_uint64_round_to_zero(f, s);
4658 }
4659 
4660 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4661 {
4662     if (float64_is_any_nan(f)) {
4663         float_raise(float_flag_invalid, s);
4664         return 0;
4665     }
4666     return float64_to_uint64_round_to_zero(f, s);
4667 }
4668 
4669 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4670 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4671 DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
4672 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4673 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4674 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4675 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4676 
4677 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4678 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4679 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4680 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4681 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4682 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4683 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4684 
4685 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4686 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4687 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4688 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4689 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4690 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4691 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4692 
4693 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4694 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4695 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4696 
4697 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4698 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4699 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4700 
4701 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4702 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4703 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4704 
4705 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4706 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4707 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4708 
4709 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4710 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4711 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4712 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4713 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4714 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4715 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4716 
4717 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4718 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4719 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4720 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4721 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4722 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4723 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4724 
4725 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4726 {
4727     /* Extract frac to the top of the uint32_t. */
4728     uint32_t frac = (uint32_t)a << (16 + 6);
4729     int16_t exp = extract32(a, 10, 5);
4730 
4731     if (unlikely(exp == 0)) {
4732         if (frac != 0) {
4733             if (!get_flush_inputs_to_zero(s)) {
4734                 /* denormal: bias - fractional_zeros */
4735                 return -15 - clz32(frac);
4736             }
4737             /* flush to zero */
4738             float_raise(float_flag_input_denormal_flushed, s);
4739         }
4740     } else if (unlikely(exp == 0x1f)) {
4741         if (frac == 0) {
4742             return INT16_MAX; /* infinity */
4743         }
4744     } else {
4745         /* normal: exp - bias */
4746         return exp - 15;
4747     }
4748     /* nan or zero */
4749     float_raise(float_flag_invalid, s);
4750     return INT16_MIN;
4751 }
4752 
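/*
 * Worked example for the denormal case above: a float16 denormal has the
 * value frac * 2^-24, with frac a 10-bit integer.  After the shift into
 * the top of the 32-bit word, a highest set fraction bit K gives
 * clz32(frac) == 9 - K, so -15 - clz32(frac) == K - 24, the true base-2
 * exponent.  The float32 and float64 variants below follow the same
 * pattern with their own bias and field widths.
 */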
4753 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4754 {
4755     /* Extract frac to the top of the uint32_t. */
4756     uint32_t frac = a << 9;
4757     int32_t exp = extract32(a, 23, 8);
4758 
4759     if (unlikely(exp == 0)) {
4760         if (frac != 0) {
4761             if (!get_flush_inputs_to_zero(s)) {
4762                 /* denormal: bias - fractional_zeros */
4763                 return -127 - clz32(frac);
4764             }
4765             /* flush to zero */
4766             float_raise(float_flag_input_denormal_flushed, s);
4767         }
4768     } else if (unlikely(exp == 0xff)) {
4769         if (frac == 0) {
4770             return INT32_MAX; /* infinity */
4771         }
4772     } else {
4773         /* normal: exp - bias */
4774         return exp - 127;
4775     }
4776     /* nan or zero */
4777     float_raise(float_flag_invalid, s);
4778     return INT32_MIN;
4779 }
4780 
4781 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4782 {
4783     /* Extract frac to the top of the uint64_t. */
4784     uint64_t frac = a << 12;
4785     int64_t exp = extract64(a, 52, 11);
4786 
4787     if (unlikely(exp == 0)) {
4788         if (frac != 0) {
4789             if (!get_flush_inputs_to_zero(s)) {
4790                 /* denormal: bias - fractional_zeros */
4791                 return -1023 - clz64(frac);
4792             }
4793             /* flush to zero */
4794             float_raise(float_flag_input_denormal_flushed, s);
4795         }
4796     } else if (unlikely(exp == 0x7ff)) {
4797         if (frac == 0) {
4798             return INT64_MAX; /* infinity */
4799         }
4800     } else {
4801         /* normal: exp - bias */
4802         return exp - 1023;
4803     }
4804     /* nan or zero */
4805     float_raise(float_flag_invalid, s);
4806     return INT64_MIN;
4807 }
4808 
4809 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4810 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4811 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4812 
4813 #undef DO_ZPZ_FP
4814 
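/*
 * Predicated fused multiply-add.  NEG1 and NEG3 are XOR masks applied to
 * the first and third source elements; they implement the FMLS/FNMLA/
 * FNMLS sign flips when FPCR.AH == 0.  FLAGS carries float_muladd_negate_*
 * bits and is used for the FPCR.AH == 1 forms instead, where the negation
 * must happen inside the muladd so that NaN operands keep their sign.
 */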
4815 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4816                             float_status *status, uint32_t desc,
4817                             uint16_t neg1, uint16_t neg3, int flags)
4818 {
4819     intptr_t i = simd_oprsz(desc);
4820     uint64_t *g = vg;
4821 
4822     do {
4823         uint64_t pg = g[(i - 1) >> 6];
4824         do {
4825             i -= 2;
4826             if (likely((pg >> (i & 63)) & 1)) {
4827                 float16 e1, e2, e3, r;
4828 
4829                 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4830                 e2 = *(uint16_t *)(vm + H1_2(i));
4831                 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4832                 r = float16_muladd(e1, e2, e3, flags, status);
4833                 *(uint16_t *)(vd + H1_2(i)) = r;
4834             }
4835         } while (i & 63);
4836     } while (i != 0);
4837 }
4838 
4839 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4840                               void *vg, float_status *status, uint32_t desc)
4841 {
4842     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4843 }
4844 
4845 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4846                               void *vg, float_status *status, uint32_t desc)
4847 {
4848     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0);
4849 }
4850 
4851 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4852                                void *vg, float_status *status, uint32_t desc)
4853 {
4854     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0);
4855 }
4856 
4857 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4858                                void *vg, float_status *status, uint32_t desc)
4859 {
4860     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0);
4861 }
4862 
4863 void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4864                                  void *vg, float_status *status, uint32_t desc)
4865 {
4866     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4867                     float_muladd_negate_product);
4868 }
4869 
4870 void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4871                                   void *vg, float_status *status, uint32_t desc)
4872 {
4873     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4874                     float_muladd_negate_product | float_muladd_negate_c);
4875 }
4876 
4877 void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4878                                   void *vg, float_status *status, uint32_t desc)
4879 {
4880     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0,
4881                     float_muladd_negate_c);
4882 }
4883 
4884 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4885                             float_status *status, uint32_t desc,
4886                             uint32_t neg1, uint32_t neg3, int flags)
4887 {
4888     intptr_t i = simd_oprsz(desc);
4889     uint64_t *g = vg;
4890 
4891     do {
4892         uint64_t pg = g[(i - 1) >> 6];
4893         do {
4894             i -= 4;
4895             if (likely((pg >> (i & 63)) & 1)) {
4896                 float32 e1, e2, e3, r;
4897 
4898                 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4899                 e2 = *(uint32_t *)(vm + H1_4(i));
4900                 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4901                 r = float32_muladd(e1, e2, e3, flags, status);
4902                 *(uint32_t *)(vd + H1_4(i)) = r;
4903             }
4904         } while (i & 63);
4905     } while (i != 0);
4906 }
4907 
4908 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4909                               void *vg, float_status *status, uint32_t desc)
4910 {
4911     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4912 }
4913 
4914 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4915                               void *vg, float_status *status, uint32_t desc)
4916 {
4917     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0);
4918 }
4919 
4920 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4921                                void *vg, float_status *status, uint32_t desc)
4922 {
4923     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0);
4924 }
4925 
4926 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4927                                void *vg, float_status *status, uint32_t desc)
4928 {
4929     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0);
4930 }
4931 
4932 void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4933                                  void *vg, float_status *status, uint32_t desc)
4934 {
4935     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4936                     float_muladd_negate_product);
4937 }
4938 
4939 void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4940                                   void *vg, float_status *status, uint32_t desc)
4941 {
4942     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4943                     float_muladd_negate_product | float_muladd_negate_c);
4944 }
4945 
4946 void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4947                                   void *vg, float_status *status, uint32_t desc)
4948 {
4949     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0,
4950                     float_muladd_negate_c);
4951 }
4952 
4953 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4954                             float_status *status, uint32_t desc,
4955                             uint64_t neg1, uint64_t neg3, int flags)
4956 {
4957     intptr_t i = simd_oprsz(desc);
4958     uint64_t *g = vg;
4959 
4960     do {
4961         uint64_t pg = g[(i - 1) >> 6];
4962         do {
4963             i -= 8;
4964             if (likely((pg >> (i & 63)) & 1)) {
4965                 float64 e1, e2, e3, r;
4966 
4967                 e1 = *(uint64_t *)(vn + i) ^ neg1;
4968                 e2 = *(uint64_t *)(vm + i);
4969                 e3 = *(uint64_t *)(va + i) ^ neg3;
4970                 r = float64_muladd(e1, e2, e3, flags, status);
4971                 *(uint64_t *)(vd + i) = r;
4972             }
4973         } while (i & 63);
4974     } while (i != 0);
4975 }
4976 
4977 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4978                               void *vg, float_status *status, uint32_t desc)
4979 {
4980     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0);
4981 }
4982 
4983 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4984                               void *vg, float_status *status, uint32_t desc)
4985 {
4986     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0);
4987 }
4988 
4989 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4990                                void *vg, float_status *status, uint32_t desc)
4991 {
4992     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0);
4993 }
4994 
4995 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4996                                void *vg, float_status *status, uint32_t desc)
4997 {
4998     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0);
4999 }
5000 
5001 void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5002                                  void *vg, float_status *status, uint32_t desc)
5003 {
5004     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5005                     float_muladd_negate_product);
5006 }
5007 
5008 void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5009                                   void *vg, float_status *status, uint32_t desc)
5010 {
5011     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5012                     float_muladd_negate_product | float_muladd_negate_c);
5013 }
5014 
5015 void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5016                                   void *vg, float_status *status, uint32_t desc)
5017 {
5018     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0,
5019                     float_muladd_negate_c);
5020 }
5021 
5022 /* Two operand floating-point comparison controlled by a predicate.
5023  * Unlike the integer version, we are not allowed to optimistically
5024  * compare operands, since the comparison may have side effects wrt
5025  * the FPSR.
5026  */
5027 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
5028 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
5029                   float_status *status, uint32_t desc)                  \
5030 {                                                                       \
5031     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
5032     uint64_t *d = vd, *g = vg;                                          \
5033     do {                                                                \
5034         uint64_t out = 0, pg = g[j];                                    \
5035         do {                                                            \
5036             i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
5037             if (likely((pg >> (i & 63)) & 1)) {                         \
5038                 TYPE nn = *(TYPE *)(vn + H(i));                         \
5039                 TYPE mm = *(TYPE *)(vm + H(i));                         \
5040                 out |= OP(TYPE, nn, mm, status);                        \
5041             }                                                           \
5042         } while (i & 63);                                               \
5043         d[j--] = out;                                                   \
5044     } while (i > 0);                                                    \
5045 }
5046 
5047 #define DO_FPCMP_PPZZ_H(NAME, OP) \
5048     DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
5049 #define DO_FPCMP_PPZZ_S(NAME, OP) \
5050     DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
5051 #define DO_FPCMP_PPZZ_D(NAME, OP) \
5052     DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
5053 
5054 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
5055     DO_FPCMP_PPZZ_H(NAME, OP)   \
5056     DO_FPCMP_PPZZ_S(NAME, OP)   \
5057     DO_FPCMP_PPZZ_D(NAME, OP)
5058 
5059 #define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
5060 #define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
5061 #define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
5062 #define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
5063 #define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
5064 #define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
5065 #define DO_FCMUO(TYPE, X, Y, ST)  \
5066     TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
5067 #define DO_FACGE(TYPE, X, Y, ST)  \
5068     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
5069 #define DO_FACGT(TYPE, X, Y, ST)  \
5070     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
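/*
 * Note the operand order above: GE/GT are computed as compare(Y, X) <= 0
 * (resp. < 0) rather than compare(X, Y) >= 0, so that an unordered
 * result (float_relation_unordered == 2) never satisfies the test.  The
 * EQ/NE/UO forms use the quiet compare, while the ordering and absolute
 * comparisons use the signaling compare, which raises Invalid for any
 * NaN operand as the architecture requires.
 */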
5071 
5072 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
5073 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
5074 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
5075 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
5076 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
5077 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
5078 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
5079 
5080 #undef DO_FPCMP_PPZZ_ALL
5081 #undef DO_FPCMP_PPZZ_D
5082 #undef DO_FPCMP_PPZZ_S
5083 #undef DO_FPCMP_PPZZ_H
5084 #undef DO_FPCMP_PPZZ
5085 
5086 /* One operand floating-point comparison against zero, controlled
5087  * by a predicate.
5088  */
5089 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
5090 void HELPER(NAME)(void *vd, void *vn, void *vg,            \
5091                   float_status *status, uint32_t desc)     \
5092 {                                                          \
5093     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
5094     uint64_t *d = vd, *g = vg;                             \
5095     do {                                                   \
5096         uint64_t out = 0, pg = g[j];                       \
5097         do {                                               \
5098             i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
5099             if ((pg >> (i & 63)) & 1) {                    \
5100                 TYPE nn = *(TYPE *)(vn + H(i));            \
5101                 out |= OP(TYPE, nn, 0, status);            \
5102             }                                              \
5103         } while (i & 63);                                  \
5104         d[j--] = out;                                      \
5105     } while (i > 0);                                       \
5106 }
5107 
5108 #define DO_FPCMP_PPZ0_H(NAME, OP) \
5109     DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
5110 #define DO_FPCMP_PPZ0_S(NAME, OP) \
5111     DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
5112 #define DO_FPCMP_PPZ0_D(NAME, OP) \
5113     DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
5114 
5115 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
5116     DO_FPCMP_PPZ0_H(NAME, OP)   \
5117     DO_FPCMP_PPZ0_S(NAME, OP)   \
5118     DO_FPCMP_PPZ0_D(NAME, OP)
5119 
5120 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
5121 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
5122 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
5123 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
5124 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
5125 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
5126 
5127 /* FP Trig Multiply-Add. */
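/*
 * FTMAD computes d[i] = n[i] * |m[i]| + coeff[imm + (m[i] < 0 ? 8 : 0)];
 * with FPCR.AH == 1 the |m| is realised by negating the product inside
 * the muladd rather than the operand, so a NaN m keeps its sign.  The
 * tables correspond to the coefficient table of the architectural
 * FPTrigMAdd helper: approximately the series coefficients 1, -1/6,
 * 1/120, ... (first half) and 1, -1/2, 1/24, ... (second half), from
 * which FTMAD/FTSSEL sequences build sine and cosine polynomials.
 */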
5128 
5129 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm,
5130                          float_status *s, uint32_t desc)
5131 {
5132     static const float16 coeff[16] = {
5133         0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5134         0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
5135     };
5136     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
5137     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5138     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5139     float16 *d = vd, *n = vn, *m = vm;
5140 
5141     for (i = 0; i < opr_sz; i++) {
5142         float16 mm = m[i];
5143         intptr_t xx = x;
5144         int flags = 0;
5145 
5146         if (float16_is_neg(mm)) {
5147             if (fpcr_ah) {
5148                 flags = float_muladd_negate_product;
5149             } else {
5150                 mm = float16_abs(mm);
5151             }
5152             xx += 8;
5153         }
5154         d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s);
5155     }
5156 }
5157 
5158 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm,
5159                          float_status *s, uint32_t desc)
5160 {
5161     static const float32 coeff[16] = {
5162         0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5163         0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5164         0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5165         0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5166     };
5167     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5168     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5169     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5170     float32 *d = vd, *n = vn, *m = vm;
5171 
5172     for (i = 0; i < opr_sz; i++) {
5173         float32 mm = m[i];
5174         intptr_t xx = x;
5175         int flags = 0;
5176 
5177         if (float32_is_neg(mm)) {
5178             if (fpcr_ah) {
5179                 flags = float_muladd_negate_product;
5180             } else {
5181                 mm = float32_abs(mm);
5182             }
5183             xx += 8;
5184         }
5185         d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s);
5186     }
5187 }
5188 
5189 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm,
5190                          float_status *s, uint32_t desc)
5191 {
5192     static const float64 coeff[16] = {
5193         0x3ff0000000000000ull, 0xbfc5555555555543ull,
5194         0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5195         0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5196         0x3de5d8408868552full, 0x0000000000000000ull,
5197         0x3ff0000000000000ull, 0xbfe0000000000000ull,
5198         0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5199         0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5200         0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5201     };
5202     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5203     intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3);
5204     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1);
5205     float64 *d = vd, *n = vn, *m = vm;
5206 
5207     for (i = 0; i < opr_sz; i++) {
5208         float64 mm = m[i];
5209         intptr_t xx = x;
5210         int flags = 0;
5211 
5212         if (float64_is_neg(mm)) {
5213             if (fpcr_ah) {
5214                 flags = float_muladd_negate_product;
5215             } else {
5216                 mm = float64_abs(mm);
5217             }
5218             xx += 8;
5219         }
5220         d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s);
5221     }
5222 }
5223 
5224 /*
5225  * FP Complex Add
5226  */
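/*
 * Each (real, imag) element pair of the first operand has the second
 * operand added after a 90 or 270 degree rotation in the complex plane:
 * with the rot bit clear, the imaginary part of M is negated before
 * being added to the real lane (the #90 case); with it set, the real
 * part of M is negated before being added to the imaginary lane (#270).
 * With FPCR.AH == 1 the negation preserves the sign of NaNs
 * (float*_maybe_ah_chs).
 */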
5227 
5228 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5229                          float_status *s, uint32_t desc)
5230 {
5231     intptr_t j, i = simd_oprsz(desc);
5232     uint64_t *g = vg;
5233     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5234     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5235 
5236     do {
5237         uint64_t pg = g[(i - 1) >> 6];
5238         do {
5239             float16 e0, e1, e2, e3;
5240 
5241             /* I holds the real index; J holds the imag index.  */
5242             j = i - sizeof(float16);
5243             i -= 2 * sizeof(float16);
5244 
5245             e0 = *(float16 *)(vn + H1_2(i));
5246             e1 = *(float16 *)(vm + H1_2(j));
5247             e2 = *(float16 *)(vn + H1_2(j));
5248             e3 = *(float16 *)(vm + H1_2(i));
5249 
5250             if (rot) {
5251                 e3 = float16_maybe_ah_chs(e3, fpcr_ah);
5252             } else {
5253                 e1 = float16_maybe_ah_chs(e1, fpcr_ah);
5254             }
5255 
5256             if (likely((pg >> (i & 63)) & 1)) {
5257                 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s);
5258             }
5259             if (likely((pg >> (j & 63)) & 1)) {
5260                 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s);
5261             }
5262         } while (i & 63);
5263     } while (i != 0);
5264 }
5265 
5266 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5267                          float_status *s, uint32_t desc)
5268 {
5269     intptr_t j, i = simd_oprsz(desc);
5270     uint64_t *g = vg;
5271     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5272     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5273 
5274     do {
5275         uint64_t pg = g[(i - 1) >> 6];
5276         do {
5277             float32 e0, e1, e2, e3;
5278 
5279             /* I holds the real index; J holds the imag index.  */
5280             j = i - sizeof(float32);
5281             i -= 2 * sizeof(float32);
5282 
5283             e0 = *(float32 *)(vn + H1_2(i));
5284             e1 = *(float32 *)(vm + H1_2(j));
5285             e2 = *(float32 *)(vn + H1_2(j));
5286             e3 = *(float32 *)(vm + H1_2(i));
5287 
5288             if (rot) {
5289                 e3 = float32_maybe_ah_chs(e3, fpcr_ah);
5290             } else {
5291                 e1 = float32_maybe_ah_chs(e1, fpcr_ah);
5292             }
5293 
5294             if (likely((pg >> (i & 63)) & 1)) {
5295                 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s);
5296             }
5297             if (likely((pg >> (j & 63)) & 1)) {
5298                 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s);
5299             }
5300         } while (i & 63);
5301     } while (i != 0);
5302 }
5303 
5304 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5305                          float_status *s, uint32_t desc)
5306 {
5307     intptr_t j, i = simd_oprsz(desc);
5308     uint64_t *g = vg;
5309     bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
5310     bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5311 
5312     do {
5313         uint64_t pg = g[(i - 1) >> 6];
5314         do {
5315             float64 e0, e1, e2, e3;
5316 
5317             /* I holds the real index; J holds the imag index.  */
5318             j = i - sizeof(float64);
5319             i -= 2 * sizeof(float64);
5320 
5321             e0 = *(float64 *)(vn + H1_2(i));
5322             e1 = *(float64 *)(vm + H1_2(j));
5323             e2 = *(float64 *)(vn + H1_2(j));
5324             e3 = *(float64 *)(vm + H1_2(i));
5325 
5326             if (rot) {
5327                 e3 = float64_maybe_ah_chs(e3, fpcr_ah);
5328             } else {
5329                 e1 = float64_maybe_ah_chs(e1, fpcr_ah);
5330             }
5331 
5332             if (likely((pg >> (i & 63)) & 1)) {
5333                 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s);
5334             }
5335             if (likely((pg >> (j & 63)) & 1)) {
5336                 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s);
5337             }
5338         } while (i & 63);
5339     } while (i != 0);
5340 }
5341 
5342 /*
5343  * FP Complex Multiply
5344  */
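/*
 * FCMLA: each helper invocation accumulates one pair of partial products
 * of the complex multiply into the addend.  FLIP selects whether the
 * real or the imaginary element of N feeds both lanes, and the negation
 * of the two M elements encodes the rotation.  As elsewhere, with
 * FPCR.AH == 0 the negation is applied by XOR-ing the sign bit into the
 * M element (negx_*); with FPCR.AH == 1 it is folded into float*_muladd
 * via float_muladd_negate_product (negf_*) so that NaN operands keep
 * their sign.
 */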
5345 
5346 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5347                                void *vg, float_status *status, uint32_t desc)
5348 {
5349     intptr_t j, i = simd_oprsz(desc);
5350     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5351     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5352     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5353     uint32_t negf_real = flip ^ negf_imag;
5354     float16 negx_imag, negx_real;
5355     uint64_t *g = vg;
5356 
5357     /* With AH=0, use negx; with AH=1 use negf. */
5358     negx_real = (negf_real & ~fpcr_ah) << 15;
5359     negx_imag = (negf_imag & ~fpcr_ah) << 15;
5360     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5361     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5362 
5363     do {
5364         uint64_t pg = g[(i - 1) >> 6];
5365         do {
5366             float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5367 
5368             /* I holds the real index; J holds the imag index.  */
5369             j = i - sizeof(float16);
5370             i -= 2 * sizeof(float16);
5371 
5372             nr = *(float16 *)(vn + H1_2(i));
5373             ni = *(float16 *)(vn + H1_2(j));
5374             mr = *(float16 *)(vm + H1_2(i));
5375             mi = *(float16 *)(vm + H1_2(j));
5376 
5377             e2 = (flip ? ni : nr);
5378             e1 = (flip ? mi : mr) ^ negx_real;
5379             e4 = e2;
5380             e3 = (flip ? mr : mi) ^ negx_imag;
5381 
5382             if (likely((pg >> (i & 63)) & 1)) {
5383                 d = *(float16 *)(va + H1_2(i));
5384                 d = float16_muladd(e2, e1, d, negf_real, status);
5385                 *(float16 *)(vd + H1_2(i)) = d;
5386             }
5387             if (likely((pg >> (j & 63)) & 1)) {
5388                 d = *(float16 *)(va + H1_2(j));
5389                 d = float16_muladd(e4, e3, d, negf_imag, status);
5390                 *(float16 *)(vd + H1_2(j)) = d;
5391             }
5392         } while (i & 63);
5393     } while (i != 0);
5394 }
5395 
5396 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5397                                void *vg, float_status *status, uint32_t desc)
5398 {
5399     intptr_t j, i = simd_oprsz(desc);
5400     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5401     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5402     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5403     uint32_t negf_real = flip ^ negf_imag;
5404     float32 negx_imag, negx_real;
5405     uint64_t *g = vg;
5406 
5407     /* With AH=0, use negx; with AH=1 use negf. */
5408     negx_real = (negf_real & ~fpcr_ah) << 31;
5409     negx_imag = (negf_imag & ~fpcr_ah) << 31;
5410     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5411     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5412 
5413     do {
5414         uint64_t pg = g[(i - 1) >> 6];
5415         do {
5416             float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5417 
5418             /* I holds the real index; J holds the imag index.  */
5419             j = i - sizeof(float32);
5420             i -= 2 * sizeof(float32);
5421 
5422             nr = *(float32 *)(vn + H1_2(i));
5423             ni = *(float32 *)(vn + H1_2(j));
5424             mr = *(float32 *)(vm + H1_2(i));
5425             mi = *(float32 *)(vm + H1_2(j));
5426 
5427             e2 = (flip ? ni : nr);
5428             e1 = (flip ? mi : mr) ^ negx_real;
5429             e4 = e2;
5430             e3 = (flip ? mr : mi) ^ negx_imag;
5431 
5432             if (likely((pg >> (i & 63)) & 1)) {
5433                 d = *(float32 *)(va + H1_2(i));
5434                 d = float32_muladd(e2, e1, d, negf_real, status);
5435                 *(float32 *)(vd + H1_2(i)) = d;
5436             }
5437             if (likely((pg >> (j & 63)) & 1)) {
5438                 d = *(float32 *)(va + H1_2(j));
5439                 d = float32_muladd(e4, e3, d, negf_imag, status);
5440                 *(float32 *)(vd + H1_2(j)) = d;
5441             }
5442         } while (i & 63);
5443     } while (i != 0);
5444 }
5445 
5446 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5447                                void *vg, float_status *status, uint32_t desc)
5448 {
5449     intptr_t j, i = simd_oprsz(desc);
5450     bool flip = extract32(desc, SIMD_DATA_SHIFT, 1);
5451     uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
5452     uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
5453     uint32_t negf_real = flip ^ negf_imag;
5454     float64 negx_imag, negx_real;
5455     uint64_t *g = vg;
5456 
5457     /* With AH=0, use negx; with AH=1 use negf. */
5458     negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
5459     negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
5460     negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
5461     negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
5462 
5463     do {
5464         uint64_t pg = g[(i - 1) >> 6];
5465         do {
5466             float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5467 
5468             /* I holds the real index; J holds the imag index.  */
5469             j = i - sizeof(float64);
5470             i -= 2 * sizeof(float64);
5471 
5472             nr = *(float64 *)(vn + H1_2(i));
5473             ni = *(float64 *)(vn + H1_2(j));
5474             mr = *(float64 *)(vm + H1_2(i));
5475             mi = *(float64 *)(vm + H1_2(j));
5476 
5477             e2 = (flip ? ni : nr);
5478             e1 = (flip ? mi : mr) ^ negx_real;
5479             e4 = e2;
5480             e3 = (flip ? mr : mi) ^ negx_imag;
5481 
5482             if (likely((pg >> (i & 63)) & 1)) {
5483                 d = *(float64 *)(va + H1_2(i));
5484                 d = float64_muladd(e2, e1, d, negf_real, status);
5485                 *(float64 *)(vd + H1_2(i)) = d;
5486             }
5487             if (likely((pg >> (j & 63)) & 1)) {
5488                 d = *(float64 *)(va + H1_2(j));
5489                 d = float64_muladd(e4, e3, d, negf_imag, status);
5490                 *(float64 *)(vd + H1_2(j)) = d;
5491             }
5492         } while (i & 63);
5493     } while (i != 0);
5494 }
5495 
5496 /*
5497  * Load contiguous data, protected by a governing predicate.
5498  */
5499 
5500 /*
5501  * Skip through a sequence of inactive elements in the guarding predicate @vg,
5502  * beginning at @reg_off bounded by @reg_max.  Return the offset of the active
5503  * element >= @reg_off, or @reg_max if there were no active elements at all.
5504  */
5505 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5506                                  intptr_t reg_max, int esz)
5507 {
5508     uint64_t pg_mask = pred_esz_masks[esz];
5509     uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5510 
5511     /* In normal usage, the first element is active.  */
5512     if (likely(pg & 1)) {
5513         return reg_off;
5514     }
5515 
5516     if (pg == 0) {
5517         reg_off &= -64;
5518         do {
5519             reg_off += 64;
5520             if (unlikely(reg_off >= reg_max)) {
5521                 /* The entire predicate was false.  */
5522                 return reg_max;
5523             }
5524             pg = vg[reg_off >> 6] & pg_mask;
5525         } while (pg == 0);
5526     }
5527     reg_off += ctz64(pg);
5528 
5529     /* We should never see an out of range predicate bit set.  */
5530     tcg_debug_assert(reg_off < reg_max);
5531     return reg_off;
5532 }
5533 
5534 /*
5535  * Resolve the guest virtual address to info->host and info->flags.
5536  * If @nofault, return false if the page is invalid, otherwise
5537  * exit via page fault exception.
5538  */
5539 
5540 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5541                     target_ulong addr, int mem_off, MMUAccessType access_type,
5542                     int mmu_idx, uintptr_t retaddr)
5543 {
5544     int flags;
5545 
5546     addr += mem_off;
5547 
5548     /*
5549      * User-only currently always issues with TBI.  See the comment
5550      * above useronly_clean_ptr.  Usually we clean this top byte away
5551      * during translation, but we can't do that for e.g. vector + imm
5552      * addressing modes.
5553      *
5554      * We currently always enable TBI for user-only, and do not provide
5555      * a way to turn it off.  So clean the pointer unconditionally here,
5556      * rather than look it up here, or pass it down from above.
5557      */
5558     addr = useronly_clean_ptr(addr);
5559 
5560 #ifdef CONFIG_USER_ONLY
5561     flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5562                                &info->host, retaddr);
5563 #else
5564     CPUTLBEntryFull *full;
5565     flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5566                               &info->host, &full, retaddr);
5567 #endif
5568     info->flags = flags;
5569 
5570     if (flags & TLB_INVALID_MASK) {
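        /*
         * The probe reports an invalid page only when @nofault suppressed
         * the fault; otherwise it would already have raised the exception.
         */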
5571         g_assert(nofault);
5572         return false;
5573     }
5574 
5575 #ifdef CONFIG_USER_ONLY
5576     memset(&info->attrs, 0, sizeof(info->attrs));
5577     /* Require both ANON and MTE; see allocation_tag_mem(). */
5578     info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5579 #else
5580     info->attrs = full->attrs;
5581     info->tagged = full->extra.arm.pte_attrs == 0xf0;
5582 #endif
5583 
5584     /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5585     info->host -= mem_off;
5586     return true;
5587 }
5588 
5589 /*
5590  * Find first active element on each page, and a loose bound for the
5591  * final element on each page.  Identify any single element that spans
5592  * the page boundary.  Return true if there are any active elements.
5593  */
5594 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5595                             intptr_t reg_max, int esz, int msize)
5596 {
5597     const int esize = 1 << esz;
5598     const uint64_t pg_mask = pred_esz_masks[esz];
5599     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5600     intptr_t mem_off_last, mem_off_split;
5601     intptr_t page_split, elt_split;
5602     intptr_t i;
5603 
5604     /* Set all of the element indices to -1, and the TLB data to 0. */
5605     memset(info, -1, offsetof(SVEContLdSt, page));
5606     memset(info->page, 0, sizeof(info->page));
5607 
5608     /* Gross scan over the entire predicate to find bounds. */
5609     i = 0;
5610     do {
5611         uint64_t pg = vg[i] & pg_mask;
5612         if (pg) {
5613             reg_off_last = i * 64 + 63 - clz64(pg);
5614             if (reg_off_first < 0) {
5615                 reg_off_first = i * 64 + ctz64(pg);
5616             }
5617         }
5618     } while (++i * 64 < reg_max);
5619 
5620     if (unlikely(reg_off_first < 0)) {
5621         /* No active elements, no pages touched. */
5622         return false;
5623     }
5624     tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5625 
5626     info->reg_off_first[0] = reg_off_first;
5627     info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5628     mem_off_last = (reg_off_last >> esz) * msize;
5629 
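    /* Number of bytes from @addr to the end of its page. */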
5630     page_split = -(addr | TARGET_PAGE_MASK);
5631     if (likely(mem_off_last + msize <= page_split)) {
5632         /* The entire operation fits within a single page. */
5633         info->reg_off_last[0] = reg_off_last;
5634         return true;
5635     }
5636 
5637     info->page_split = page_split;
5638     elt_split = page_split / msize;
5639     reg_off_split = elt_split << esz;
5640     mem_off_split = elt_split * msize;
5641 
5642     /*
5643      * This is the last full element on the first page, but it is not
5644      * necessarily active.  If there is no full element, i.e. the first
5645      * active element is the one that's split, this value remains -1.
5646      * It is useful as an iteration bound.
5647      */
5648     if (elt_split != 0) {
5649         info->reg_off_last[0] = reg_off_split - esize;
5650     }
5651 
5652     /* Determine if an unaligned element spans the pages.  */
5653     if (page_split % msize != 0) {
5654         /* It is helpful to know if the split element is active. */
5655         if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5656             info->reg_off_split = reg_off_split;
5657             info->mem_off_split = mem_off_split;
5658 
5659             if (reg_off_split == reg_off_last) {
5660                 /* The page crossing element is last. */
5661                 return true;
5662             }
5663         }
5664         reg_off_split += esize;
5665         mem_off_split += msize;
5666     }
5667 
5668     /*
5669      * We do want the first active element on the second page, because
5670      * this may affect the address reported in an exception.
5671      */
5672     reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5673     tcg_debug_assert(reg_off_split <= reg_off_last);
5674     info->reg_off_first[1] = reg_off_split;
5675     info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5676     info->reg_off_last[1] = reg_off_last;
5677     return true;
5678 }
5679 
5680 /*
5681  * Resolve the guest virtual addresses to info->page[].
5682  * Control the generation of page faults with @fault.  Return false if
5683  * there is no work to do, which can only happen with @fault == FAULT_NO.
5684  */
5685 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5686                          CPUARMState *env, target_ulong addr,
5687                          MMUAccessType access_type, uintptr_t retaddr)
5688 {
5689     int mmu_idx = arm_env_mmu_index(env);
5690     int mem_off = info->mem_off_first[0];
5691     bool nofault = fault == FAULT_NO;
5692     bool have_work = true;
5693 
5694     if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5695                         access_type, mmu_idx, retaddr)) {
5696         /* No work to be done. */
5697         return false;
5698     }
5699 
5700     if (likely(info->page_split < 0)) {
5701         /* The entire operation was on the one page. */
5702         return true;
5703     }
5704 
5705     /*
5706      * If the second page is invalid, then we want the fault address to be
5707      * the first byte on that page which is accessed.
5708      */
5709     if (info->mem_off_split >= 0) {
5710         /*
5711          * There is an element split across the pages.  The fault address
5712          * should be the first byte of the second page.
5713          */
5714         mem_off = info->page_split;
5715         /*
5716          * If the split element is also the first active element
5717          * of the vector, then:  For first-fault we should continue
5718          * to generate faults for the second page.  For no-fault,
5719          * we have work only if the second page is valid.
5720          */
5721         if (info->mem_off_first[0] < info->mem_off_split) {
5722             nofault = FAULT_FIRST;
5723             have_work = false;
5724         }
5725     } else {
5726         /*
5727          * There is no element split across the pages.  The fault address
5728          * should be the first active element on the second page.
5729          */
5730         mem_off = info->mem_off_first[1];
5731         /*
5732          * There must have been one active element on the first page,
5733          * so we're out of first-fault territory.
5734          */
5735         nofault = fault != FAULT_ALL;
5736     }
5737 
5738     have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5739                                 access_type, mmu_idx, retaddr);
5740     return have_work;
5741 }
5742 
5743 #ifndef CONFIG_USER_ONLY
5744 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5745                                uint64_t *vg, target_ulong addr,
5746                                int esize, int msize, int wp_access,
5747                                uintptr_t retaddr)
5748 {
5749     intptr_t mem_off, reg_off, reg_last;
5750     int flags0 = info->page[0].flags;
5751     int flags1 = info->page[1].flags;
5752 
5753     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5754         return;
5755     }
5756 
5757     /* Indicate that watchpoints are handled. */
5758     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5759     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5760 
5761     if (flags0 & TLB_WATCHPOINT) {
5762         mem_off = info->mem_off_first[0];
5763         reg_off = info->reg_off_first[0];
5764         reg_last = info->reg_off_last[0];
5765 
5766         while (reg_off <= reg_last) {
5767             uint64_t pg = vg[reg_off >> 6];
5768             do {
5769                 if ((pg >> (reg_off & 63)) & 1) {
5770                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5771                                          msize, info->page[0].attrs,
5772                                          wp_access, retaddr);
5773                 }
5774                 reg_off += esize;
5775                 mem_off += msize;
5776             } while (reg_off <= reg_last && (reg_off & 63));
5777         }
5778     }
5779 
5780     mem_off = info->mem_off_split;
5781     if (mem_off >= 0) {
5782         cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5783                              info->page[0].attrs, wp_access, retaddr);
5784     }
5785 
5786     mem_off = info->mem_off_first[1];
5787     if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5788         reg_off = info->reg_off_first[1];
5789         reg_last = info->reg_off_last[1];
5790 
5791         do {
5792             uint64_t pg = vg[reg_off >> 6];
5793             do {
5794                 if ((pg >> (reg_off & 63)) & 1) {
5795                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5796                                          msize, info->page[1].attrs,
5797                                          wp_access, retaddr);
5798                 }
5799                 reg_off += esize;
5800                 mem_off += msize;
5801             } while (reg_off & 63);
5802         } while (reg_off <= reg_last);
5803     }
5804 }
5805 #endif
5806 
5807 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5808                              uint64_t *vg, target_ulong addr, int esize,
5809                              int msize, uint32_t mtedesc, uintptr_t ra)
5810 {
5811     intptr_t mem_off, reg_off, reg_last;
5812 
5813     /* Process the page only if MemAttr == Tagged. */
5814     if (info->page[0].tagged) {
5815         mem_off = info->mem_off_first[0];
5816         reg_off = info->reg_off_first[0];
5817         reg_last = info->reg_off_split;
5818         if (reg_last < 0) {
5819             reg_last = info->reg_off_last[0];
5820         }
5821 
5822         do {
5823             uint64_t pg = vg[reg_off >> 6];
5824             do {
5825                 if ((pg >> (reg_off & 63)) & 1) {
5826                     mte_check(env, mtedesc, addr, ra);
5827                 }
5828                 reg_off += esize;
5829                 mem_off += msize;
5830             } while (reg_off <= reg_last && (reg_off & 63));
5831         } while (reg_off <= reg_last);
5832     }
5833 
5834     mem_off = info->mem_off_first[1];
5835     if (mem_off >= 0 && info->page[1].tagged) {
5836         reg_off = info->reg_off_first[1];
5837         reg_last = info->reg_off_last[1];
5838 
5839         do {
5840             uint64_t pg = vg[reg_off >> 6];
5841             do {
5842                 if ((pg >> (reg_off & 63)) & 1) {
5843                     mte_check(env, mtedesc, addr, ra);
5844                 }
5845                 reg_off += esize;
5846                 mem_off += msize;
5847             } while (reg_off & 63);
5848         } while (reg_off <= reg_last);
5849     }
5850 }
5851 
5852 /*
5853  * Common helper for all contiguous 1,2,3,4-register predicated loads.
5854  */
5855 static inline QEMU_ALWAYS_INLINE
5856 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5857                uint32_t desc, const uintptr_t retaddr,
5858                const int esz, const int msz, const int N, uint32_t mtedesc,
5859                sve_ldst1_host_fn *host_fn,
5860                sve_ldst1_tlb_fn *tlb_fn)
5861 {
5862     const unsigned rd = simd_data(desc);
5863     const intptr_t reg_max = simd_oprsz(desc);
5864     intptr_t reg_off, reg_last, mem_off;
5865     SVEContLdSt info;
5866     void *host;
5867     int flags, i;
5868 
5869     /* Find the active elements.  */
5870     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5871         /* The entire predicate was false; no load occurs.  */
5872         for (i = 0; i < N; ++i) {
5873             memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5874         }
5875         return;
5876     }
5877 
5878     /* Probe the page(s).  Exit with exception for any invalid page. */
5879     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5880 
5881     /* Handle watchpoints for all active elements. */
5882     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5883                               BP_MEM_READ, retaddr);
5884 
5885     /*
5886      * Handle mte checks for all active elements.
5887      * Since TBI must be set for MTE, !mtedesc => !mte_active.
5888      */
5889     if (mtedesc) {
5890         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5891                                 mtedesc, retaddr);
5892     }
5893 
5894     flags = info.page[0].flags | info.page[1].flags;
5895     if (unlikely(flags != 0)) {
5896         /*
5897          * At least one page includes MMIO.
5898          * Any bus operation can fail with cpu_transaction_failed,
5899          * which for ARM will raise SyncExternal.  Perform the load
5900          * into scratch memory to preserve register state until the end.
5901          */
5902         ARMVectorReg scratch[4] = { };
5903 
5904         mem_off = info.mem_off_first[0];
5905         reg_off = info.reg_off_first[0];
5906         reg_last = info.reg_off_last[1];
5907         if (reg_last < 0) {
5908             reg_last = info.reg_off_split;
5909             if (reg_last < 0) {
5910                 reg_last = info.reg_off_last[0];
5911             }
5912         }
5913 
5914         do {
5915             uint64_t pg = vg[reg_off >> 6];
5916             do {
5917                 if ((pg >> (reg_off & 63)) & 1) {
5918                     for (i = 0; i < N; ++i) {
5919                         tlb_fn(env, &scratch[i], reg_off,
5920                                addr + mem_off + (i << msz), retaddr);
5921                     }
5922                 }
5923                 reg_off += 1 << esz;
5924                 mem_off += N << msz;
5925             } while (reg_off & 63);
5926         } while (reg_off <= reg_last);
5927 
5928         for (i = 0; i < N; ++i) {
5929             memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5930         }
5931         return;
5932     }
5933 
5934     /* The entire operation is in RAM, on valid pages. */
5935 
5936     for (i = 0; i < N; ++i) {
5937         memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5938     }
5939 
5940     mem_off = info.mem_off_first[0];
5941     reg_off = info.reg_off_first[0];
5942     reg_last = info.reg_off_last[0];
5943     host = info.page[0].host;
5944 
5945     set_helper_retaddr(retaddr);
5946 
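    /*
     * Walk one 64-bit predicate word per outer iteration; the inner loop
     * covers the elements governed by that word.
     */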
5947     while (reg_off <= reg_last) {
5948         uint64_t pg = vg[reg_off >> 6];
5949         do {
5950             if ((pg >> (reg_off & 63)) & 1) {
5951                 for (i = 0; i < N; ++i) {
5952                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5953                             host + mem_off + (i << msz));
5954                 }
5955             }
5956             reg_off += 1 << esz;
5957             mem_off += N << msz;
5958         } while (reg_off <= reg_last && (reg_off & 63));
5959     }
5960 
5961     clear_helper_retaddr();
5962 
5963     /*
5964      * Use the slow path to manage the cross-page misalignment.
5965      * But we know this is RAM and cannot trap.
5966      */
5967     mem_off = info.mem_off_split;
5968     if (unlikely(mem_off >= 0)) {
5969         reg_off = info.reg_off_split;
5970         for (i = 0; i < N; ++i) {
5971             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5972                    addr + mem_off + (i << msz), retaddr);
5973         }
5974     }
5975 
5976     mem_off = info.mem_off_first[1];
5977     if (unlikely(mem_off >= 0)) {
5978         reg_off = info.reg_off_first[1];
5979         reg_last = info.reg_off_last[1];
5980         host = info.page[1].host;
5981 
5982         set_helper_retaddr(retaddr);
5983 
5984         do {
5985             uint64_t pg = vg[reg_off >> 6];
5986             do {
5987                 if ((pg >> (reg_off & 63)) & 1) {
5988                     for (i = 0; i < N; ++i) {
5989                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5990                                 host + mem_off + (i << msz));
5991                     }
5992                 }
5993                 reg_off += 1 << esz;
5994                 mem_off += N << msz;
5995             } while (reg_off & 63);
5996         } while (reg_off <= reg_last);
5997 
5998         clear_helper_retaddr();
5999     }
6000 }
6001 
6002 static inline QEMU_ALWAYS_INLINE
6003 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6004                    uint32_t desc, const uintptr_t ra,
6005                    const int esz, const int msz, const int N,
6006                    sve_ldst1_host_fn *host_fn,
6007                    sve_ldst1_tlb_fn *tlb_fn)
6008 {
6009     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6010     int bit55 = extract64(addr, 55, 1);
6011 
6012     /* Remove mtedesc from the normal sve descriptor. */
6013     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6014 
6015     /* Perform gross MTE suppression early. */
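    /*
     * If TBI is disabled for this address, or the TCMA rules make the
     * access unchecked, no tag check can fire; clearing mtedesc lets the
     * main helper skip the MTE path entirely.
     */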
6016     if (!tbi_check(mtedesc, bit55) ||
6017         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6018         mtedesc = 0;
6019     }
6020 
6021     sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6022 }
6023 
6024 #define DO_LD1_1(NAME, ESZ)                                             \
6025 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
6026                             target_ulong addr, uint32_t desc)           \
6027 {                                                                       \
6028     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
6029               sve_##NAME##_host, sve_##NAME##_tlb);                     \
6030 }                                                                       \
6031 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
6032                                 target_ulong addr, uint32_t desc)       \
6033 {                                                                       \
6034     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
6035                   sve_##NAME##_host, sve_##NAME##_tlb);                 \
6036 }
6037 
6038 #define DO_LD1_2(NAME, ESZ, MSZ)                                        \
6039 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
6040                                target_ulong addr, uint32_t desc)        \
6041 {                                                                       \
6042     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
6043               sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
6044 }                                                                       \
6045 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
6046                                target_ulong addr, uint32_t desc)        \
6047 {                                                                       \
6048     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
6049               sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
6050 }                                                                       \
6051 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
6052                                    target_ulong addr, uint32_t desc)    \
6053 {                                                                       \
6054     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
6055                   sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
6056 }                                                                       \
6057 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
6058                                    target_ulong addr, uint32_t desc)    \
6059 {                                                                       \
6060     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
6061                   sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
6062 }
6063 
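/*
 * The helper suffix encodes memory and register element sizes: e.g. ld1bhu
 * loads bytes zero-extended into 16-bit ("h") elements, ld1sds loads 32-bit
 * words sign-extended into 64-bit elements.  DO_LD1_2 also emits the
 * little-/big-endian and MTE variants of each helper.
 */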
6064 DO_LD1_1(ld1bb,  MO_8)
6065 DO_LD1_1(ld1bhu, MO_16)
6066 DO_LD1_1(ld1bhs, MO_16)
6067 DO_LD1_1(ld1bsu, MO_32)
6068 DO_LD1_1(ld1bss, MO_32)
6069 DO_LD1_1(ld1bdu, MO_64)
6070 DO_LD1_1(ld1bds, MO_64)
6071 
6072 DO_LD1_2(ld1hh,  MO_16, MO_16)
6073 DO_LD1_2(ld1hsu, MO_32, MO_16)
6074 DO_LD1_2(ld1hss, MO_32, MO_16)
6075 DO_LD1_2(ld1hdu, MO_64, MO_16)
6076 DO_LD1_2(ld1hds, MO_64, MO_16)
6077 
6078 DO_LD1_2(ld1ss,  MO_32, MO_32)
6079 DO_LD1_2(ld1sdu, MO_64, MO_32)
6080 DO_LD1_2(ld1sds, MO_64, MO_32)
6081 
6082 DO_LD1_2(ld1dd,  MO_64, MO_64)
6083 
6084 #undef DO_LD1_1
6085 #undef DO_LD1_2
6086 
6087 #define DO_LDN_1(N)                                                     \
6088 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
6089                              target_ulong addr, uint32_t desc)          \
6090 {                                                                       \
6091     sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
6092               sve_ld1bb_host, sve_ld1bb_tlb);                           \
6093 }                                                                       \
6094 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
6095                                  target_ulong addr, uint32_t desc)      \
6096 {                                                                       \
6097     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
6098                   sve_ld1bb_host, sve_ld1bb_tlb);                       \
6099 }
6100 
6101 #define DO_LDN_2(N, SUFF, ESZ)                                          \
6102 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
6103                                     target_ulong addr, uint32_t desc)   \
6104 {                                                                       \
6105     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
6106               sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
6107 }                                                                       \
6108 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
6109                                     target_ulong addr, uint32_t desc)   \
6110 {                                                                       \
6111     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
6112               sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
6113 }                                                                       \
6114 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
6115                                         target_ulong addr, uint32_t desc) \
6116 {                                                                       \
6117     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
6118                   sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
6119 }                                                                       \
6120 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
6121                                         target_ulong addr, uint32_t desc) \
6122 {                                                                       \
6123     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
6124                   sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
6125 }
6126 
6127 DO_LDN_1(2)
6128 DO_LDN_1(3)
6129 DO_LDN_1(4)
6130 
6131 DO_LDN_2(2, hh, MO_16)
6132 DO_LDN_2(3, hh, MO_16)
6133 DO_LDN_2(4, hh, MO_16)
6134 
6135 DO_LDN_2(2, ss, MO_32)
6136 DO_LDN_2(3, ss, MO_32)
6137 DO_LDN_2(4, ss, MO_32)
6138 
6139 DO_LDN_2(2, dd, MO_64)
6140 DO_LDN_2(3, dd, MO_64)
6141 DO_LDN_2(4, dd, MO_64)
6142 
6143 #undef DO_LDN_1
6144 #undef DO_LDN_2
6145 
6146 /*
6147  * Load contiguous data, first-fault and no-fault.
6148  *
6149  * For user-only, we control the race between page_check_range and
6150  * another thread's munmap by using set/clear_helper_retaddr.  Any
6151  * SEGV that occurs between those markers is assumed to be because
6152  * the guest page vanished.  Keep that block as small as possible
6153  * so that unrelated QEMU bugs are not blamed on the guest.
6154  */
6155 
6156 /* Fault on byte I.  All bits in FFR from I are cleared.  The vector
6157  * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
6158  * option, which leaves subsequent data unchanged.
6159  */
6160 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
6161 {
6162     uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
6163 
6164     if (i & 63) {
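    /*
     * E.g. i == 70: keep bits 0..5 of ffr[1] (FFR bits 64..69), clear
     * FFR bits 70..127, then zero every following 64-bit word.
     */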
6165         ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
6166         i = ROUND_UP(i, 64);
6167     }
6168     for (; i < oprsz; i += 64) {
6169         ffr[i / 64] = 0;
6170     }
6171 }
6172 
6173 /*
6174  * Common helper for all contiguous no-fault and first-fault loads.
6175  */
6176 static inline QEMU_ALWAYS_INLINE
6177 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
6178                    uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
6179                    const int esz, const int msz, const SVEContFault fault,
6180                    sve_ldst1_host_fn *host_fn,
6181                    sve_ldst1_tlb_fn *tlb_fn)
6182 {
6183     const unsigned rd = simd_data(desc);
6184     void *vd = &env->vfp.zregs[rd];
6185     const intptr_t reg_max = simd_oprsz(desc);
6186     intptr_t reg_off, mem_off, reg_last;
6187     SVEContLdSt info;
6188     int flags;
6189     void *host;
6190 
6191     /* Find the active elements.  */
6192     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
6193         /* The entire predicate was false; no load occurs.  */
6194         memset(vd, 0, reg_max);
6195         return;
6196     }
6197     reg_off = info.reg_off_first[0];
6198 
6199     /* Probe the page(s). */
6200     if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6201         /* Fault on first element. */
6202         tcg_debug_assert(fault == FAULT_NO);
6203         memset(vd, 0, reg_max);
6204         goto do_fault;
6205     }
6206 
6207     mem_off = info.mem_off_first[0];
6208     flags = info.page[0].flags;
6209 
6210     /*
6211      * Disable MTE checking if the Tagged bit is not set.  Since TBI must
6212      * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6213      */
6214     if (!info.page[0].tagged) {
6215         mtedesc = 0;
6216     }
6217 
6218     if (fault == FAULT_FIRST) {
6219         /* Trapping mte check for the first-fault element.  */
6220         if (mtedesc) {
6221             mte_check(env, mtedesc, addr + mem_off, retaddr);
6222         }
6223 
6224         /*
6225          * Special handling of the first active element,
6226          * if it crosses a page boundary or is MMIO.
6227          */
6228         bool is_split = mem_off == info.mem_off_split;
6229         if (unlikely(flags != 0) || unlikely(is_split)) {
6230             /*
6231              * Use the slow path for cross-page handling.
6232              * Might trap for MMIO or watchpoints.
6233              */
6234             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6235 
6236             /* After any fault, zero the other elements. */
6237             swap_memzero(vd, reg_off);
6238             reg_off += 1 << esz;
6239             mem_off += 1 << msz;
6240             swap_memzero(vd + reg_off, reg_max - reg_off);
6241 
6242             if (is_split) {
6243                 goto second_page;
6244             }
6245         } else {
6246             memset(vd, 0, reg_max);
6247         }
6248     } else {
6249         memset(vd, 0, reg_max);
6250         if (unlikely(mem_off == info.mem_off_split)) {
6251             /* The first active element crosses a page boundary. */
6252             flags |= info.page[1].flags;
6253             if (unlikely(flags & TLB_MMIO)) {
6254                 /* Some page is MMIO, see below. */
6255                 goto do_fault;
6256             }
6257             if (unlikely(flags & TLB_WATCHPOINT) &&
6258                 (cpu_watchpoint_address_matches
6259                  (env_cpu(env), addr + mem_off, 1 << msz)
6260                  & BP_MEM_READ)) {
6261                 /* Watchpoint hit, see below. */
6262                 goto do_fault;
6263             }
6264             if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6265                 goto do_fault;
6266             }
6267             /*
6268              * Use the slow path for cross-page handling.
6269              * This is RAM, without a watchpoint, and will not trap.
6270              */
6271             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6272             goto second_page;
6273         }
6274     }
6275 
6276     /*
6277      * From this point on, all memory operations are MemSingleNF.
6278      *
6279      * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6280      * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6281      *
6282      * Unfortunately we do not have access to the memory attributes from the
6283      * PTE to tell Device memory from Normal memory.  So we make a mostly
6284      * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6285      * This gives the right answer for the common cases of "Normal memory,
6286      * backed by host RAM" and "Device memory, backed by MMIO".
6287      * The architecture allows us to suppress an NF load and return
6288      * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6289      * case of "Normal memory, backed by MMIO" is permitted.  The case we
6290      * get wrong is "Device memory, backed by host RAM", for which we
6291      * should return (UNKNOWN, FAULT) but do not.
6292      *
6293      * Similarly, CPU_BP breakpoints would raise exceptions, and so
6294      * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
6295      * architectural breakpoints the same.
6296      */
6297     if (unlikely(flags & TLB_MMIO)) {
6298         goto do_fault;
6299     }
6300 
6301     reg_last = info.reg_off_last[0];
6302     host = info.page[0].host;
6303 
6304     set_helper_retaddr(retaddr);
6305 
6306     do {
6307         uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
6308         do {
6309             if ((pg >> (reg_off & 63)) & 1) {
6310                 if (unlikely(flags & TLB_WATCHPOINT) &&
6311                     (cpu_watchpoint_address_matches
6312                      (env_cpu(env), addr + mem_off, 1 << msz)
6313                      & BP_MEM_READ)) {
6314                     clear_helper_retaddr();
6315                     goto do_fault;
6316                 }
6317                 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6318                     clear_helper_retaddr();
6319                     goto do_fault;
6320                 }
6321                 host_fn(vd, reg_off, host + mem_off);
6322             }
6323             reg_off += 1 << esz;
6324             mem_off += 1 << msz;
6325         } while (reg_off <= reg_last && (reg_off & 63));
6326     } while (reg_off <= reg_last);
6327 
6328     clear_helper_retaddr();
6329 
6330     /*
6331      * MemSingleNF is allowed to fail for any reason.  We have special
6332      * code above to handle the first element crossing a page boundary.
6333      * As an implementation choice, decline to handle a cross-page element
6334      * in any other position.
6335      */
6336     reg_off = info.reg_off_split;
6337     if (reg_off >= 0) {
6338         goto do_fault;
6339     }
6340 
6341  second_page:
6342     reg_off = info.reg_off_first[1];
6343     if (likely(reg_off < 0)) {
6344         /* No active elements on the second page.  All done. */
6345         return;
6346     }
6347 
6348     /*
6349      * MemSingleNF is allowed to fail for any reason.  As an implementation
6350      * choice, decline to handle elements on the second page.  This should
6351      * be low frequency as the guest walks through memory -- the next
6352      * iteration of the guest's loop should be aligned on the page boundary,
6353      * and then all following iterations will stay aligned.
6354      */
6355 
6356  do_fault:
6357     record_fault(env, reg_off, reg_max);
6358 }
6359 
6360 static inline QEMU_ALWAYS_INLINE
6361 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6362                        uint32_t desc, const uintptr_t retaddr,
6363                        const int esz, const int msz, const SVEContFault fault,
6364                        sve_ldst1_host_fn *host_fn,
6365                        sve_ldst1_tlb_fn *tlb_fn)
6366 {
6367     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6368     int bit55 = extract64(addr, 55, 1);
6369 
6370     /* Remove mtedesc from the normal sve descriptor. */
6371     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6372 
6373     /* Perform gross MTE suppression early. */
6374     if (!tbi_check(mtedesc, bit55) ||
6375         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6376         mtedesc = 0;
6377     }
6378 
6379     sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6380                   esz, msz, fault, host_fn, tlb_fn);
6381 }
6382 
6383 #define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
6384 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
6385                                  target_ulong addr, uint32_t desc)      \
6386 {                                                                       \
6387     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6388                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6389 }                                                                       \
6390 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
6391                                  target_ulong addr, uint32_t desc)      \
6392 {                                                                       \
6393     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6394                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6395 }                                                                       \
6396 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6397                                      target_ulong addr, uint32_t desc)  \
6398 {                                                                       \
6399     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6400                       sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
6401 }                                                                       \
6402 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6403                                      target_ulong addr, uint32_t desc)  \
6404 {                                                                       \
6405     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6406                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6407 }
6408 
6409 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
6410 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
6411                                     target_ulong addr, uint32_t desc)   \
6412 {                                                                       \
6413     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6414                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6415 }                                                                       \
6416 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
6417                                     target_ulong addr, uint32_t desc)   \
6418 {                                                                       \
6419     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6420                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6421 }                                                                       \
6422 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
6423                                     target_ulong addr, uint32_t desc)   \
6424 {                                                                       \
6425     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6426                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6427 }                                                                       \
6428 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
6429                                     target_ulong addr, uint32_t desc)   \
6430 {                                                                       \
6431     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6432                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6433 }                                                                       \
6434 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6435                                         target_ulong addr, uint32_t desc) \
6436 {                                                                       \
6437     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6438                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6439 }                                                                       \
6440 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6441                                         target_ulong addr, uint32_t desc) \
6442 {                                                                       \
6443     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6444                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6445 }                                                                       \
6446 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6447                                         target_ulong addr, uint32_t desc) \
6448 {                                                                       \
6449     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6450                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6451 }                                                                       \
6452 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6453                                         target_ulong addr, uint32_t desc) \
6454 {                                                                       \
6455     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6456                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6457 }
6458 
6459 DO_LDFF1_LDNF1_1(bb,  MO_8)
6460 DO_LDFF1_LDNF1_1(bhu, MO_16)
6461 DO_LDFF1_LDNF1_1(bhs, MO_16)
6462 DO_LDFF1_LDNF1_1(bsu, MO_32)
6463 DO_LDFF1_LDNF1_1(bss, MO_32)
6464 DO_LDFF1_LDNF1_1(bdu, MO_64)
6465 DO_LDFF1_LDNF1_1(bds, MO_64)
6466 
6467 DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
6468 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6469 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6470 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6471 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6472 
6473 DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
6474 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6475 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6476 
6477 DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)
6478 
6479 #undef DO_LDFF1_LDNF1_1
6480 #undef DO_LDFF1_LDNF1_2
6481 
6482 /*
6483  * Common helper for all contiguous 1,2,3,4-register predicated stores.
6484  */
6485 
6486 static inline QEMU_ALWAYS_INLINE
6487 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6488                uint32_t desc, const uintptr_t retaddr,
6489                const int esz, const int msz, const int N, uint32_t mtedesc,
6490                sve_ldst1_host_fn *host_fn,
6491                sve_ldst1_tlb_fn *tlb_fn)
6492 {
6493     const unsigned rd = simd_data(desc);
6494     const intptr_t reg_max = simd_oprsz(desc);
6495     intptr_t reg_off, reg_last, mem_off;
6496     SVEContLdSt info;
6497     void *host;
6498     int i, flags;
6499 
6500     /* Find the active elements.  */
6501     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6502         /* The entire predicate was false; no store occurs.  */
6503         return;
6504     }
6505 
6506     /* Probe the page(s).  Exit with exception for any invalid page. */
6507     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6508 
6509     /* Handle watchpoints for all active elements. */
6510     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6511                               BP_MEM_WRITE, retaddr);
6512 
6513     /*
6514      * Handle mte checks for all active elements.
6515      * Since TBI must be set for MTE, !mtedesc => !mte_active.
6516      */
6517     if (mtedesc) {
6518         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6519                                 mtedesc, retaddr);
6520     }
6521 
6522     flags = info.page[0].flags | info.page[1].flags;
6523     if (unlikely(flags != 0)) {
6524         /*
6525          * At least one page includes MMIO.
6526          * Any bus operation can fail with cpu_transaction_failed,
6527          * which for ARM will raise SyncExternal.  We cannot avoid
6528          * this fault and will leave with the store incomplete.
6529          */
6530         mem_off = info.mem_off_first[0];
6531         reg_off = info.reg_off_first[0];
6532         reg_last = info.reg_off_last[1];
6533         if (reg_last < 0) {
6534             reg_last = info.reg_off_split;
6535             if (reg_last < 0) {
6536                 reg_last = info.reg_off_last[0];
6537             }
6538         }
6539 
6540         do {
6541             uint64_t pg = vg[reg_off >> 6];
6542             do {
6543                 if ((pg >> (reg_off & 63)) & 1) {
6544                     for (i = 0; i < N; ++i) {
6545                         tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6546                                addr + mem_off + (i << msz), retaddr);
6547                     }
6548                 }
6549                 reg_off += 1 << esz;
6550                 mem_off += N << msz;
6551             } while (reg_off & 63);
6552         } while (reg_off <= reg_last);
6553         return;
6554     }
6555 
6556     mem_off = info.mem_off_first[0];
6557     reg_off = info.reg_off_first[0];
6558     reg_last = info.reg_off_last[0];
6559     host = info.page[0].host;
6560 
6561     set_helper_retaddr(retaddr);
6562 
6563     while (reg_off <= reg_last) {
6564         uint64_t pg = vg[reg_off >> 6];
6565         do {
6566             if ((pg >> (reg_off & 63)) & 1) {
6567                 for (i = 0; i < N; ++i) {
6568                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6569                             host + mem_off + (i << msz));
6570                 }
6571             }
6572             reg_off += 1 << esz;
6573             mem_off += N << msz;
6574         } while (reg_off <= reg_last && (reg_off & 63));
6575     }
6576 
6577     clear_helper_retaddr();
6578 
6579     /*
6580      * Use the slow path to manage the cross-page misalignment.
6581      * But we know this is RAM and cannot trap.
6582      */
6583     mem_off = info.mem_off_split;
6584     if (unlikely(mem_off >= 0)) {
6585         reg_off = info.reg_off_split;
6586         for (i = 0; i < N; ++i) {
6587             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6588                    addr + mem_off + (i << msz), retaddr);
6589         }
6590     }
6591 
6592     mem_off = info.mem_off_first[1];
6593     if (unlikely(mem_off >= 0)) {
6594         reg_off = info.reg_off_first[1];
6595         reg_last = info.reg_off_last[1];
6596         host = info.page[1].host;
6597 
6598         set_helper_retaddr(retaddr);
6599 
6600         do {
6601             uint64_t pg = vg[reg_off >> 6];
6602             do {
6603                 if ((pg >> (reg_off & 63)) & 1) {
6604                     for (i = 0; i < N; ++i) {
6605                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6606                                 host + mem_off + (i << msz));
6607                     }
6608                 }
6609                 reg_off += 1 << esz;
6610                 mem_off += N << msz;
6611             } while (reg_off & 63);
6612         } while (reg_off <= reg_last);
6613 
6614         clear_helper_retaddr();
6615     }
6616 }
6617 
6618 static inline QEMU_ALWAYS_INLINE
6619 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6620                    uint32_t desc, const uintptr_t ra,
6621                    const int esz, const int msz, const int N,
6622                    sve_ldst1_host_fn *host_fn,
6623                    sve_ldst1_tlb_fn *tlb_fn)
6624 {
6625     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6626     int bit55 = extract64(addr, 55, 1);
6627 
6628     /* Remove mtedesc from the normal sve descriptor. */
6629     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6630 
6631     /* Perform gross MTE suppression early. */
6632     if (!tbi_check(mtedesc, bit55) ||
6633         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6634         mtedesc = 0;
6635     }
6636 
6637     sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6638 }
6639 
6640 #define DO_STN_1(N, NAME, ESZ)                                          \
6641 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
6642                                  target_ulong addr, uint32_t desc)      \
6643 {                                                                       \
6644     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
6645               sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
6646 }                                                                       \
6647 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
6648                                      target_ulong addr, uint32_t desc)  \
6649 {                                                                       \
6650     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
6651                   sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
6652 }
6653 
6654 #define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
6655 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
6656                                     target_ulong addr, uint32_t desc)   \
6657 {                                                                       \
6658     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6659               sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
6660 }                                                                       \
6661 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
6662                                     target_ulong addr, uint32_t desc)   \
6663 {                                                                       \
6664     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6665               sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
6666 }                                                                       \
6667 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
6668                                         target_ulong addr, uint32_t desc) \
6669 {                                                                       \
6670     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6671                   sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
6672 }                                                                       \
6673 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
6674                                         target_ulong addr, uint32_t desc) \
6675 {                                                                       \
6676     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6677                   sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
6678 }
6679 
6680 DO_STN_1(1, bb, MO_8)
6681 DO_STN_1(1, bh, MO_16)
6682 DO_STN_1(1, bs, MO_32)
6683 DO_STN_1(1, bd, MO_64)
6684 DO_STN_1(2, bb, MO_8)
6685 DO_STN_1(3, bb, MO_8)
6686 DO_STN_1(4, bb, MO_8)
6687 
6688 DO_STN_2(1, hh, MO_16, MO_16)
6689 DO_STN_2(1, hs, MO_32, MO_16)
6690 DO_STN_2(1, hd, MO_64, MO_16)
6691 DO_STN_2(2, hh, MO_16, MO_16)
6692 DO_STN_2(3, hh, MO_16, MO_16)
6693 DO_STN_2(4, hh, MO_16, MO_16)
6694 
6695 DO_STN_2(1, ss, MO_32, MO_32)
6696 DO_STN_2(1, sd, MO_64, MO_32)
6697 DO_STN_2(2, ss, MO_32, MO_32)
6698 DO_STN_2(3, ss, MO_32, MO_32)
6699 DO_STN_2(4, ss, MO_32, MO_32)
6700 
6701 DO_STN_2(1, dd, MO_64, MO_64)
6702 DO_STN_2(2, dd, MO_64, MO_64)
6703 DO_STN_2(3, dd, MO_64, MO_64)
6704 DO_STN_2(4, dd, MO_64, MO_64)
6705 
6706 #undef DO_STN_1
6707 #undef DO_STN_2
6708 
6709 /*
6710  * Loads with a vector index.
6711  */
6712 
6713 /*
6714  * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6715  */
6716 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6717 
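/*
 * As the definitions below show, "zsu"/"zss" take 32-bit offsets zero- or
 * sign-extended, while "zd" takes a full 64-bit offset; the trailing _s/_d
 * gives the size of the vector element that holds the offset.
 */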
6718 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6719 {
6720     return *(uint32_t *)(reg + H1_4(reg_ofs));
6721 }
6722 
6723 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6724 {
6725     return *(int32_t *)(reg + H1_4(reg_ofs));
6726 }
6727 
6728 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6729 {
6730     return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6731 }
6732 
6733 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6734 {
6735     return (int32_t)*(uint64_t *)(reg + reg_ofs);
6736 }
6737 
6738 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6739 {
6740     return *(uint64_t *)(reg + reg_ofs);
6741 }
6742 
6743 static inline QEMU_ALWAYS_INLINE
6744 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6745                target_ulong base, uint32_t desc, uintptr_t retaddr,
6746                uint32_t mtedesc, int esize, int msize,
6747                zreg_off_fn *off_fn,
6748                sve_ldst1_host_fn *host_fn,
6749                sve_ldst1_tlb_fn *tlb_fn)
6750 {
6751     const int mmu_idx = arm_env_mmu_index(env);
6752     const intptr_t reg_max = simd_oprsz(desc);
6753     const int scale = simd_data(desc);
6754     ARMVectorReg scratch;
6755     intptr_t reg_off;
6756     SVEHostPage info, info2;
6757 
6758     memset(&scratch, 0, reg_max);
6759     reg_off = 0;
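    /*
     * Walk one 64-bit predicate word at a time; the inner loop advances
     * reg_off in element-size steps and shifts pg down in lock step, so
     * bit 0 of pg is always the predicate bit for the current element.
     */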
6760     do {
6761         uint64_t pg = vg[reg_off >> 6];
6762         do {
6763             if (likely(pg & 1)) {
6764                 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6765                 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6766 
6767                 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6768                                mmu_idx, retaddr);
6769 
6770                 if (likely(in_page >= msize)) {
6771                     if (unlikely(info.flags & TLB_WATCHPOINT)) {
6772                         cpu_check_watchpoint(env_cpu(env), addr, msize,
6773                                              info.attrs, BP_MEM_READ, retaddr);
6774                     }
6775                     if (mtedesc && info.tagged) {
6776                         mte_check(env, mtedesc, addr, retaddr);
6777                     }
6778                     if (unlikely(info.flags & TLB_MMIO)) {
6779                         tlb_fn(env, &scratch, reg_off, addr, retaddr);
6780                     } else {
6781                         set_helper_retaddr(retaddr);
6782                         host_fn(&scratch, reg_off, info.host);
6783                         clear_helper_retaddr();
6784                     }
6785                 } else {
6786                     /* Element crosses the page boundary. */
6787                     sve_probe_page(&info2, false, env, addr + in_page, 0,
6788                                    MMU_DATA_LOAD, mmu_idx, retaddr);
6789                     if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6790                         cpu_check_watchpoint(env_cpu(env), addr,
6791                                              msize, info.attrs,
6792                                              BP_MEM_READ, retaddr);
6793                     }
6794                     if (mtedesc && info.tagged) {
6795                         mte_check(env, mtedesc, addr, retaddr);
6796                     }
6797                     tlb_fn(env, &scratch, reg_off, addr, retaddr);
6798                 }
6799             }
6800             reg_off += esize;
6801             pg >>= esize;
6802         } while (reg_off & 63);
6803     } while (reg_off < reg_max);
6804 
6805     /* Wait until all exceptions have been raised to write back.  */
6806     memcpy(vd, &scratch, reg_max);
6807 }
6808 
6809 static inline QEMU_ALWAYS_INLINE
6810 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6811                    target_ulong base, uint32_t desc, uintptr_t retaddr,
6812                    int esize, int msize, zreg_off_fn *off_fn,
6813                    sve_ldst1_host_fn *host_fn,
6814                    sve_ldst1_tlb_fn *tlb_fn)
6815 {
6816     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6817     /* Remove mtedesc from the normal sve descriptor. */
6818     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
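    /*
     * The MTE descriptor occupies the bits above SIMD_DATA_SHIFT +
     * SVE_MTEDESC_SHIFT; a zero value disables the MTE checks in
     * sve_ld1_z.
     */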
6819 
6820     /*
6821      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6822      * move base entirely across the address space hole, so it cannot
6823      * change the pointer tag or the bit55 selector.  We could therefore
6824      * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
6825      */
6826     sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6827               esize, msize, off_fn, host_fn, tlb_fn);
6828 }
6829 
6830 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6831 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6832                                  void *vm, target_ulong base, uint32_t desc) \
6833 {                                                                            \
6834     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
6835               off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6836 }                                                                            \
6837 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6838      void *vm, target_ulong base, uint32_t desc)                             \
6839 {                                                                            \
6840     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
6841                   off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6842 }
6843 
6844 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6845 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6846                                  void *vm, target_ulong base, uint32_t desc) \
6847 {                                                                            \
6848     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
6849               off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6850 }                                                                            \
6851 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6852     void *vm, target_ulong base, uint32_t desc)                              \
6853 {                                                                            \
6854     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
6855                   off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6856 }
6857 
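/*
 * Helper naming: the first part encodes the memory access, e.g. "hdu_le"
 * is a 16-bit little-endian load zero-extended into 64-bit elements;
 * the second part names the offset form, e.g. "zss" is sign-extended
 * 32-bit offsets (see the off_* extractors above).
 */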
6858 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6859 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6860 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6861 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6862 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6863 
6864 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6865 DO_LD1_ZPZ_S(bss, zss, MO_8)
6866 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6867 DO_LD1_ZPZ_D(bds, zss, MO_8)
6868 DO_LD1_ZPZ_D(bds, zd, MO_8)
6869 
6870 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6871 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6872 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6873 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6874 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6875 
6876 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6877 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6878 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6879 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6880 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6881 
6882 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6883 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6884 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6885 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6886 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6887 
6888 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6889 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6890 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6891 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6892 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6893 
6894 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6895 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6896 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6897 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6898 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6899 
6900 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6901 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6902 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6903 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6904 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6905 
6906 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6907 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6908 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6909 
6910 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6911 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6912 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6913 
6914 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6915 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6916 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6917 
6918 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6919 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6920 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6921 
6922 #undef DO_LD1_ZPZ_S
6923 #undef DO_LD1_ZPZ_D
6924 
6925 /* First fault loads with a vector index.  */
6926 
6927 /*
6928  * Common helpers for all gather first-faulting loads.
6929  */
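/*
 * The first active element is loaded with a normal, faulting access.
 * The remaining elements are probed without allowing faults: any element
 * that cannot be accessed safely (invalid page, MMIO, a read watchpoint,
 * or a failed MTE probe) terminates the load and record_fault() trims
 * the first-fault register so only the elements already loaded remain
 * marked as valid.
 */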
6930 
6931 static inline QEMU_ALWAYS_INLINE
6932 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6933                  target_ulong base, uint32_t desc, uintptr_t retaddr,
6934                  uint32_t mtedesc, const int esz, const int msz,
6935                  zreg_off_fn *off_fn,
6936                  sve_ldst1_host_fn *host_fn,
6937                  sve_ldst1_tlb_fn *tlb_fn)
6938 {
6939     const int mmu_idx = arm_env_mmu_index(env);
6940     const intptr_t reg_max = simd_oprsz(desc);
6941     const int scale = simd_data(desc);
6942     const int esize = 1 << esz;
6943     const int msize = 1 << msz;
6944     intptr_t reg_off;
6945     SVEHostPage info;
6946     target_ulong addr, in_page;
6947     ARMVectorReg scratch;
6948 
6949     /* Skip to the first true predicate.  */
6950     reg_off = find_next_active(vg, 0, reg_max, esz);
6951     if (unlikely(reg_off >= reg_max)) {
6952         /* The entire predicate was false; no load occurs.  */
6953         memset(vd, 0, reg_max);
6954         return;
6955     }
6956 
6957     /* Protect against overlap between vd and vm. */
6958     if (unlikely(vd == vm)) {
6959         vm = memcpy(&scratch, vm, reg_max);
6960     }
6961 
6962     /*
6963      * Probe the first element, allowing faults.
6964      */
6965     addr = base + (off_fn(vm, reg_off) << scale);
6966     if (mtedesc) {
6967         mte_check(env, mtedesc, addr, retaddr);
6968     }
6969     tlb_fn(env, vd, reg_off, addr, retaddr);
6970 
6971     /* After any fault, zero the other elements. */
6972     swap_memzero(vd, reg_off);
6973     reg_off += esize;
6974     swap_memzero(vd + reg_off, reg_max - reg_off);
6975 
6976     /*
6977      * Probe the remaining elements, not allowing faults.
6978      */
6979     while (reg_off < reg_max) {
6980         uint64_t pg = vg[reg_off >> 6];
6981         do {
6982             if (likely((pg >> (reg_off & 63)) & 1)) {
6983                 addr = base + (off_fn(vm, reg_off) << scale);
6984                 in_page = -(addr | TARGET_PAGE_MASK);
6985 
6986                 if (unlikely(in_page < msize)) {
6987                     /* Stop if the element crosses a page boundary. */
6988                     goto fault;
6989                 }
6990 
6991                 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6992                                mmu_idx, retaddr);
6993                 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6994                     goto fault;
6995                 }
6996                 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6997                     (cpu_watchpoint_address_matches
6998                      (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6999                     goto fault;
7000                 }
7001                 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
7002                     goto fault;
7003                 }
7004 
7005                 set_helper_retaddr(retaddr);
7006                 host_fn(vd, reg_off, info.host);
7007                 clear_helper_retaddr();
7008             }
7009             reg_off += esize;
7010         } while (reg_off & 63);
7011     }
7012     return;
7013 
7014  fault:
7015     record_fault(env, reg_off, reg_max);
7016 }
7017 
7018 static inline QEMU_ALWAYS_INLINE
7019 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7020                      target_ulong base, uint32_t desc, uintptr_t retaddr,
7021                      const int esz, const int msz,
7022                      zreg_off_fn *off_fn,
7023                      sve_ldst1_host_fn *host_fn,
7024                      sve_ldst1_tlb_fn *tlb_fn)
7025 {
7026     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7027     /* Remove mtedesc from the normal sve descriptor. */
7028     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7029 
7030     /*
7031      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7032      * move base entirely across the address space hole, so it cannot
7033      * change the pointer tag or the bit55 selector.  We could therefore
7034      * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
7035      */
7036     sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7037                 esz, msz, off_fn, host_fn, tlb_fn);
7038 }
7039 
7040 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
7041 void HELPER(sve_ldff##MEM##_##OFS)                                      \
7042     (CPUARMState *env, void *vd, void *vg,                              \
7043      void *vm, target_ulong base, uint32_t desc)                        \
7044 {                                                                       \
7045     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
7046                 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7047 }                                                                       \
7048 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
7049     (CPUARMState *env, void *vd, void *vg,                              \
7050      void *vm, target_ulong base, uint32_t desc)                        \
7051 {                                                                       \
7052     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
7053                     off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7054 }
7055 
7056 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
7057 void HELPER(sve_ldff##MEM##_##OFS)                                      \
7058     (CPUARMState *env, void *vd, void *vg,                              \
7059      void *vm, target_ulong base, uint32_t desc)                        \
7060 {                                                                       \
7061     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
7062                 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7063 }                                                                       \
7064 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
7065     (CPUARMState *env, void *vd, void *vg,                              \
7066      void *vm, target_ulong base, uint32_t desc)                        \
7067 {                                                                       \
7068     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
7069                     off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
7070 }
7071 
7072 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
7073 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
7074 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
7075 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
7076 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
7077 
7078 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
7079 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
7080 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
7081 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
7082 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
7083 
7084 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
7085 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
7086 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
7087 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
7088 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
7089 
7090 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
7091 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
7092 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
7093 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
7094 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
7095 
7096 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
7097 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
7098 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
7099 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
7100 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
7101 
7102 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
7103 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
7104 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
7105 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
7106 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
7107 
7108 DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
7109 DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
7110 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
7111 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
7112 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
7113 
7114 DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
7115 DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
7116 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
7117 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
7118 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
7119 
7120 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
7121 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
7122 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
7123 
7124 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
7125 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
7126 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
7127 
7128 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
7129 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
7130 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
7131 
7132 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
7133 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
7134 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
7135 
7136 /* Stores with a vector index.  */
7137 
7138 static inline QEMU_ALWAYS_INLINE
7139 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7140                target_ulong base, uint32_t desc, uintptr_t retaddr,
7141                uint32_t mtedesc, int esize, int msize,
7142                zreg_off_fn *off_fn,
7143                sve_ldst1_host_fn *host_fn,
7144                sve_ldst1_tlb_fn *tlb_fn)
7145 {
7146     const int mmu_idx = arm_env_mmu_index(env);
7147     const intptr_t reg_max = simd_oprsz(desc);
7148     const int scale = simd_data(desc);
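    /*
     * One host pointer per element: esize is at least 4 bytes for all
     * of the scatter stores below, so ARM_MAX_VQ * 4 bounds the element
     * count.
     */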
7149     void *host[ARM_MAX_VQ * 4];
7150     intptr_t reg_off, i;
7151     SVEHostPage info, info2;
7152 
7153     /*
7154      * Probe all of the elements for host addresses and flags.
7155      */
7156     i = reg_off = 0;
7157     do {
7158         uint64_t pg = vg[reg_off >> 6];
7159         do {
7160             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7161             target_ulong in_page = -(addr | TARGET_PAGE_MASK);
7162 
7163             host[i] = NULL;
7164             if (likely((pg >> (reg_off & 63)) & 1)) {
7165                 if (likely(in_page >= msize)) {
7166                     sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
7167                                    mmu_idx, retaddr);
7168                     if (!(info.flags & TLB_MMIO)) {
7169                         host[i] = info.host;
7170                     }
7171                 } else {
7172                     /*
7173                      * Element crosses the page boundary.
7174                      * Probe both pages, but do not record the host address,
7175                      * so that we use the slow path.
7176                      */
7177                     sve_probe_page(&info, false, env, addr, 0,
7178                                    MMU_DATA_STORE, mmu_idx, retaddr);
7179                     sve_probe_page(&info2, false, env, addr + in_page, 0,
7180                                    MMU_DATA_STORE, mmu_idx, retaddr);
7181                     info.flags |= info2.flags;
7182                 }
7183 
7184                 if (unlikely(info.flags & TLB_WATCHPOINT)) {
7185                     cpu_check_watchpoint(env_cpu(env), addr, msize,
7186                                          info.attrs, BP_MEM_WRITE, retaddr);
7187                 }
7188 
7189                 if (mtedesc && info.tagged) {
7190                     mte_check(env, mtedesc, addr, retaddr);
7191                 }
7192             }
7193             i += 1;
7194             reg_off += esize;
7195         } while (reg_off & 63);
7196     } while (reg_off < reg_max);
7197 
7198     /*
7199      * Now that we have recognized all exceptions except SyncExternal
7200      * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7201      *
7202      * Note that for the common case of an element in RAM, not crossing a page
7203      * boundary, we have stored the host address in host[].  This doubles
7204      * as a first-level check against the predicate, since only enabled
7205      * elements have non-null host addresses.
7206      */
7207     i = reg_off = 0;
7208     do {
7209         void *h = host[i];
7210         if (likely(h != NULL)) {
7211             set_helper_retaddr(retaddr);
7212             host_fn(vd, reg_off, h);
7213             clear_helper_retaddr();
7214         } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7215             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7216             tlb_fn(env, vd, reg_off, addr, retaddr);
7217         }
7218         i += 1;
7219         reg_off += esize;
7220     } while (reg_off < reg_max);
7221 }
7222 
7223 static inline QEMU_ALWAYS_INLINE
7224 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7225                    target_ulong base, uint32_t desc, uintptr_t retaddr,
7226                    int esize, int msize, zreg_off_fn *off_fn,
7227                    sve_ldst1_host_fn *host_fn,
7228                    sve_ldst1_tlb_fn *tlb_fn)
7229 {
7230     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7231     /* Remove mtedesc from the normal sve descriptor. */
7232     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7233 
7234     /*
7235      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7236      * move base entirely across the address space hole, so it cannot
7237      * change the pointer tag or the bit55 selector.  We could therefore
7238      * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
7239      */
7240     sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7241               esize, msize, off_fn, host_fn, tlb_fn);
7242 }
7243 
7244 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
7245 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7246                                  void *vm, target_ulong base, uint32_t desc) \
7247 {                                                                       \
7248     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
7249               off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7250 }                                                                       \
7251 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7252     void *vm, target_ulong base, uint32_t desc)                         \
7253 {                                                                       \
7254     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
7255                   off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7256 }
7257 
7258 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
7259 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7260                                  void *vm, target_ulong base, uint32_t desc) \
7261 {                                                                       \
7262     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
7263               off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7264 }                                                                       \
7265 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7266     void *vm, target_ulong base, uint32_t desc)                         \
7267 {                                                                       \
7268     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
7269                   off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7270 }
7271 
7272 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7273 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7274 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7275 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7276 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7277 
7278 DO_ST1_ZPZ_S(bs, zss, MO_8)
7279 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7280 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7281 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7282 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7283 
7284 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7285 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7286 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7287 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7288 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7289 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7290 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7291 
7292 DO_ST1_ZPZ_D(bd, zss, MO_8)
7293 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7294 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7295 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7296 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7297 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7298 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7299 
7300 DO_ST1_ZPZ_D(bd, zd, MO_8)
7301 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7302 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7303 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7304 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7305 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7306 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7307 
7308 #undef DO_ST1_ZPZ_S
7309 #undef DO_ST1_ZPZ_D
7310 
7311 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7312 {
7313     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7314     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7315 
7316     for (i = 0; i < opr_sz; ++i) {
7317         d[i] = n[i] ^ m[i] ^ k[i];
7318     }
7319 }
7320 
7321 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7322 {
7323     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7324     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7325 
7326     for (i = 0; i < opr_sz; ++i) {
7327         d[i] = n[i] ^ (m[i] & ~k[i]);
7328     }
7329 }
7330 
7331 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7332 {
7333     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7334     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7335 
7336     for (i = 0; i < opr_sz; ++i) {
7337         d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7338     }
7339 }
7340 
7341 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7342 {
7343     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7344     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7345 
7346     for (i = 0; i < opr_sz; ++i) {
7347         d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7348     }
7349 }
7350 
7351 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7352 {
7353     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7354     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7355 
7356     for (i = 0; i < opr_sz; ++i) {
7357         d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7358     }
7359 }
7360 
7361 /*
7362  * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7363  * See hasless(v,1) from
7364  *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7365  */
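/*
 * Worked example for esz == MO_8: with n == 0x42, cmp1 is 0x4242...42;
 * xor-ing against m0/m1 leaves a zero byte exactly where a byte matches,
 * and (x - 0x01..01) & ~x sets the sign bit of each such zero byte, so
 * the final mask with 0x80..80 is nonzero iff some byte of m0 or m1
 * equals the low byte of n.
 */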
7366 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7367 {
7368     int bits = 8 << esz;
7369     uint64_t ones = dup_const(esz, 1);
7370     uint64_t signs = ones << (bits - 1);
7371     uint64_t cmp0, cmp1;
7372 
7373     cmp1 = dup_const(esz, n);
7374     cmp0 = cmp1 ^ m0;
7375     cmp1 = cmp1 ^ m1;
7376     cmp0 = (cmp0 - ones) & ~cmp0;
7377     cmp1 = (cmp1 - ones) & ~cmp1;
7378     return (cmp0 | cmp1) & signs;
7379 }
7380 
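/*
 * MATCH/NMATCH: each active element of the first source is tested for
 * occurrence anywhere within the corresponding 16-byte segment of the
 * second source (m0/m1 above); the result is a predicate, and the NZCV
 * flags are computed over it as for a predicate test.
 */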
7381 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7382                                 uint32_t desc, int esz, bool nmatch)
7383 {
7384     uint16_t esz_mask = pred_esz_masks[esz];
7385     intptr_t opr_sz = simd_oprsz(desc);
7386     uint32_t flags = PREDTEST_INIT;
7387     intptr_t i, j, k;
7388 
7389     for (i = 0; i < opr_sz; i += 16) {
7390         uint64_t m0 = *(uint64_t *)(vm + i);
7391         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7392         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7393         uint16_t out = 0;
7394 
7395         for (j = 0; j < 16; j += 8) {
7396             uint64_t n = *(uint64_t *)(vn + i + j);
7397 
7398             for (k = 0; k < 8; k += 1 << esz) {
7399                 if (pg & (1 << (j + k))) {
7400                     bool o = do_match2(n >> (k * 8), m0, m1, esz);
7401                     out |= (o ^ nmatch) << (j + k);
7402                 }
7403             }
7404         }
7405         *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7406         flags = iter_predtest_fwd(out, pg, flags);
7407     }
7408     return flags;
7409 }
7410 
7411 #define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
7412 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
7413 {                                                                             \
7414     return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
7415 }
7416 
7417 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7418 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7419 
7420 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7421 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7422 
7423 #undef DO_PPZZ_MATCH
7424 
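/*
 * HISTCNT: for each active element i, count the active elements j <= i
 * whose value in the second source equals element i of the first
 * source; inactive destination elements are written as zero.
 */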
7425 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7426                             uint32_t desc)
7427 {
7428     ARMVectorReg scratch;
7429     intptr_t i, j;
7430     intptr_t opr_sz = simd_oprsz(desc);
7431     uint32_t *d = vd, *n = vn, *m = vm;
7432     uint8_t *pg = vg;
7433 
7434     if (d == n) {
7435         n = memcpy(&scratch, n, opr_sz);
7436         if (d == m) {
7437             m = n;
7438         }
7439     } else if (d == m) {
7440         m = memcpy(&scratch, m, opr_sz);
7441     }
7442 
7443     for (i = 0; i < opr_sz; i += 4) {
7444         uint64_t count = 0;
7445         uint8_t pred;
7446 
7447         pred = pg[H1(i >> 3)] >> (i & 7);
7448         if (pred & 1) {
7449             uint32_t nn = n[H4(i >> 2)];
7450 
7451             for (j = 0; j <= i; j += 4) {
7452                 pred = pg[H1(j >> 3)] >> (j & 7);
7453                 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7454                     ++count;
7455                 }
7456             }
7457         }
7458         d[H4(i >> 2)] = count;
7459     }
7460 }
7461 
7462 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7463                             uint32_t desc)
7464 {
7465     ARMVectorReg scratch;
7466     intptr_t i, j;
7467     intptr_t opr_sz = simd_oprsz(desc);
7468     uint64_t *d = vd, *n = vn, *m = vm;
7469     uint8_t *pg = vg;
7470 
7471     if (d == n) {
7472         n = memcpy(&scratch, n, opr_sz);
7473         if (d == m) {
7474             m = n;
7475         }
7476     } else if (d == m) {
7477         m = memcpy(&scratch, m, opr_sz);
7478     }
7479 
7480     for (i = 0; i < opr_sz / 8; ++i) {
7481         uint64_t count = 0;
7482         if (pg[H1(i)] & 1) {
7483             uint64_t nn = n[i];
7484             for (j = 0; j <= i; ++j) {
7485                 if ((pg[H1(j)] & 1) && nn == m[j]) {
7486                     ++count;
7487                 }
7488             }
7489         }
7490         d[i] = count;
7491     }
7492 }
7493 
7494 /*
7495  * Returns the number of bytes in m0 and m1 that match n.
7496  * Unlike do_match2, we don't just need true/false; we need an exact count.
7497  * This requires two extra logical operations.
7498  */
7499 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7500 {
7501     const uint64_t mask = dup_const(MO_8, 0x7f);
7502     uint64_t cmp0, cmp1;
7503 
7504     cmp1 = dup_const(MO_8, n);
7505     cmp0 = cmp1 ^ m0;
7506     cmp1 = cmp1 ^ m1;
7507 
7508     /*
7509      * 1: clear msb of each byte to avoid carry to next byte (& mask)
7510      * 2: carry in to msb if byte != 0 (+ mask)
7511      * 3: set msb if cmp has msb set (| cmp)
7512      * 4: set ~msb to ignore them (| mask)
7513      * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7514      * 5: invert, resulting in 0x80 if and only if byte == 0.
7515      */
7516     cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7517     cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7518 
7519     /*
7520      * Combine the two compares in a way that the bits do
7521      * not overlap, and so preserves the count of set bits.
7522      * If the host has an efficient instruction for ctpop,
7523      * then ctpop(x) + ctpop(y) has the same number of
7524      * operations as ctpop(x | (y >> 1)).  If the host does
7525      * not have an efficient ctpop, then we only want to
7526      * use it once.
7527      */
7528     return ctpop64(cmp0 | (cmp1 >> 1));
7529 }
7530 
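/*
 * HISTSEG: within each 16-byte segment, count for every byte of the
 * first source how many bytes of the second source in the same segment
 * are equal to it.  A segment holds 16 bytes, so each count fits in the
 * 8 bits available per destination byte.
 */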
7531 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7532 {
7533     intptr_t i, j;
7534     intptr_t opr_sz = simd_oprsz(desc);
7535 
7536     for (i = 0; i < opr_sz; i += 16) {
7537         uint64_t n0 = *(uint64_t *)(vn + i);
7538         uint64_t m0 = *(uint64_t *)(vm + i);
7539         uint64_t n1 = *(uint64_t *)(vn + i + 8);
7540         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7541         uint64_t out0 = 0;
7542         uint64_t out1 = 0;
7543 
7544         for (j = 0; j < 64; j += 8) {
7545             uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7546             uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7547             out0 |= cnt0 << j;
7548             out1 |= cnt1 << j;
7549         }
7550 
7551         *(uint64_t *)(vd + i) = out0;
7552         *(uint64_t *)(vd + i + 8) = out1;
7553     }
7554 }
7555 
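/*
 * XAR: exclusive-or the two sources, then rotate each element right by
 * the immediate.  For 8- and 16-bit elements the rotate is done within
 * 64-bit lanes with a shift/mask pair; 32-bit elements use ror32.
 */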
7556 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7557 {
7558     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7559     int shr = simd_data(desc);
7560     int shl = 8 - shr;
7561     uint64_t mask = dup_const(MO_8, 0xff >> shr);
7562     uint64_t *d = vd, *n = vn, *m = vm;
7563 
7564     for (i = 0; i < opr_sz; ++i) {
7565         uint64_t t = n[i] ^ m[i];
7566         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7567     }
7568 }
7569 
7570 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7571 {
7572     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7573     int shr = simd_data(desc);
7574     int shl = 16 - shr;
7575     uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7576     uint64_t *d = vd, *n = vn, *m = vm;
7577 
7578     for (i = 0; i < opr_sz; ++i) {
7579         uint64_t t = n[i] ^ m[i];
7580         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7581     }
7582 }
7583 
7584 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7585 {
7586     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7587     int shr = simd_data(desc);
7588     uint32_t *d = vd, *n = vn, *m = vm;
7589 
7590     for (i = 0; i < opr_sz; ++i) {
7591         d[i] = ror32(n[i] ^ m[i], shr);
7592     }
7593 }
7594 
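/*
 * FMMLA: for each group of four elements, treat the sources and the
 * addend as 2x2 row-major matrices and compute D = A + N * M^T, so each
 * result element accumulates a two-term sum of products onto the addend.
 */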
7595 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7596                      float_status *status, uint32_t desc)
7597 {
7598     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7599 
7600     for (s = 0; s < opr_sz; ++s) {
7601         float32 *n = vn + s * sizeof(float32) * 4;
7602         float32 *m = vm + s * sizeof(float32) * 4;
7603         float32 *a = va + s * sizeof(float32) * 4;
7604         float32 *d = vd + s * sizeof(float32) * 4;
7605         float32 n00 = n[H4(0)], n01 = n[H4(1)];
7606         float32 n10 = n[H4(2)], n11 = n[H4(3)];
7607         float32 m00 = m[H4(0)], m01 = m[H4(1)];
7608         float32 m10 = m[H4(2)], m11 = m[H4(3)];
7609         float32 p0, p1;
7610 
7611         /* i = 0, j = 0 */
7612         p0 = float32_mul(n00, m00, status);
7613         p1 = float32_mul(n01, m01, status);
7614         d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7615 
7616         /* i = 0, j = 1 */
7617         p0 = float32_mul(n00, m10, status);
7618         p1 = float32_mul(n01, m11, status);
7619         d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7620 
7621         /* i = 1, j = 0 */
7622         p0 = float32_mul(n10, m00, status);
7623         p1 = float32_mul(n11, m01, status);
7624         d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7625 
7626         /* i = 1, j = 1 */
7627         p0 = float32_mul(n10, m10, status);
7628         p1 = float32_mul(n11, m11, status);
7629         d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7630     }
7631 }
7632 
7633 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7634                      float_status *status, uint32_t desc)
7635 {
7636     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7637 
7638     for (s = 0; s < opr_sz; ++s) {
7639         float64 *n = vn + s * sizeof(float64) * 4;
7640         float64 *m = vm + s * sizeof(float64) * 4;
7641         float64 *a = va + s * sizeof(float64) * 4;
7642         float64 *d = vd + s * sizeof(float64) * 4;
7643         float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7644         float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7645         float64 p0, p1;
7646 
7647         /* i = 0, j = 0 */
7648         p0 = float64_mul(n00, m00, status);
7649         p1 = float64_mul(n01, m01, status);
7650         d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7651 
7652         /* i = 0, j = 1 */
7653         p0 = float64_mul(n00, m10, status);
7654         p1 = float64_mul(n01, m11, status);
7655         d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7656 
7657         /* i = 1, j = 0 */
7658         p0 = float64_mul(n10, m00, status);
7659         p1 = float64_mul(n11, m01, status);
7660         d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7661 
7662         /* i = 1, j = 1 */
7663         p0 = float64_mul(n10, m10, status);
7664         p1 = float64_mul(n11, m11, status);
7665         d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7666     }
7667 }
7668 
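/*
 * FCVTNT narrows each wide element and writes the result to the top
 * (odd-numbered) half of the corresponding destination element, leaving
 * the bottom half untouched; FCVTLT is the converse, widening the value
 * found in the top half of each source element.
 */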
7669 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7670 void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
7671                   float_status *status, uint32_t desc)                        \
7672 {                                                                             \
7673     intptr_t i = simd_oprsz(desc);                                            \
7674     uint64_t *g = vg;                                                         \
7675     do {                                                                      \
7676         uint64_t pg = g[(i - 1) >> 6];                                        \
7677         do {                                                                  \
7678             i -= sizeof(TYPEW);                                               \
7679             if (likely((pg >> (i & 63)) & 1)) {                               \
7680                 TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
7681                 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
7682             }                                                                 \
7683         } while (i & 63);                                                     \
7684     } while (i != 0);                                                         \
7685 }
7686 
7687 DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7688 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7689 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7690 
7691 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7692 void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
7693                   float_status *status, uint32_t desc)                        \
7694 {                                                                             \
7695     intptr_t i = simd_oprsz(desc);                                            \
7696     uint64_t *g = vg;                                                         \
7697     do {                                                                      \
7698         uint64_t pg = g[(i - 1) >> 6];                                        \
7699         do {                                                                  \
7700             i -= sizeof(TYPEW);                                               \
7701             if (likely((pg >> (i & 63)) & 1)) {                               \
7702                 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
7703                 *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
7704             }                                                                 \
7705         } while (i & 63);                                                     \
7706     } while (i != 0);                                                         \
7707 }
7708 
7709 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7710 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7711 
7712 #undef DO_FCVTLT
7713 #undef DO_FCVTNT
7714