xref: /qemu/target/arm/tcg/sve_helper.c (revision 68df8c8dba57f539d24f1a92a8699a179d9bb6fb)
/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "exec/exec-all.h"
#include "exec/page-protection.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "tcg/tcg.h"
#include "vec_internal.h"
#include "sve_ldst_internal.h"
#include "hw/core/tcg-cpu-ops.h"
#ifdef CONFIG_USER_ONLY
#include "user/page-protection.h"
#endif


/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1

/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}
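
/*
 * Worked example (illustrative): for a single predicate word with
 * D = 0b0101 and G = 0b1111, the first active element (G & -G == 1)
 * is true in D, so N is set; D & G is non-zero, so Z is clear (bit 1
 * set); the last active element (pow2floor(G) == 8) is false in D,
 * so C is set.  Starting from PREDTEST_INIT this returns 0x80000007:
 * N, Z-clear and C, plus the internal "first G seen" marker in bit 2.
 */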

/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e. last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e. first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}

/* The same for a single word predicate.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

/* The same for a multi-word predicate.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}

/* Similarly for single word elements.  */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    return word[byte & 0x11];
}
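
/*
 * Note: SVE allocates one predicate bit per byte of vector element,
 * so for 32-bit elements only bits 0 and 4 of each predicate byte
 * are significant; the table above maps those two bits to a mask of
 * the two 32-bit words they govern (e.g. 0x10 -> high word only).
 */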

#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    uintptr_t opr_sz = simd_oprsz(desc);                                  \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
    uintptr_t i;                                                          \
    for (i = 0; i < opr_sz / 8; ++i) {                                    \
        d[i] = FUNC(n[i], m[i], g[i]);                                    \
    }                                                                     \
}

#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP

/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; ) {                                         \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            if (pg & 1) {                                               \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 15);                                               \
    }                                                                   \
}
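
/*
 * Illustrative hand-expansion of the above (not generated code):
 * DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD) walks the vector
 * in 16-byte segments, loading 16 predicate bits at a time; after
 * each element the predicate word is shifted right by sizeof(TYPE),
 * so the low bit always governs the current element:
 *
 *   for (i = 0; i < opr_sz; ) {
 *       uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
 *       do {
 *           if (pg & 1) {
 *               uint16_t nn = *(uint16_t *)(vn + H1_2(i));
 *               uint16_t mm = *(uint16_t *)(vm + H1_2(i));
 *               *(uint16_t *)(vd + H1_2(i)) = nn + mm;
 *           }
 *           i += 2, pg >>= 2;
 *       } while (i & 15);
 *   }
 */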

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn, *m = vm;                             \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i], mm = m[i];                          \
            d[i] = OP(nn, mm);                                  \
        }                                                       \
    }                                                           \
}

#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)


/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1. Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
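
/*
 * For example, DO_SDIV(INT32_MIN, -1) takes the M == -1 arm and
 * returns -N, which wraps back to INT32_MIN (QEMU is built with
 * -fwrapv, so signed overflow has defined two's-complement
 * behaviour); that matches the architecturally required result,
 * as does the 0 returned for division by zero.
 */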

DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t,  H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t,  H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t,  H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t,  DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t,  H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t,  H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t,  H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t,  DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)

/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types.  */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
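
/*
 * Example: do_mulh_b(-2, 3) computes -6 in 32-bit arithmetic and
 * arithmetic-shifts to -1, returned as 0xff -- the signed high byte.
 * The same helper called with 200 and 3 (zero-extended by the
 * unsigned instantiation below) yields 600 >> 8 == 2, the unsigned
 * high byte.  Only the 64-bit case needs distinct signed/unsigned
 * helpers, via muls64/mulu64.
 */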

DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)

/* Note that all bits of the shift are significant
   and not modulo the element size.  */
#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
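
/*
 * E.g. an ASR of an int8_t by 200 clamps the count to 7, yielding
 * 0 or -1 according to the sign bit, while LSR/LSL of any element
 * by a count >= the element width yields 0, matching the
 * architectural behaviour of these wide-shift forms.
 */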

DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)

DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)

static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    int8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    int16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    int32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)

static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    uint8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    uint16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    uint32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)

#define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)

#define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)

/*
 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
 * We pass in a pointer to a dummy saturation field to trigger
 * the saturating arithmetic but discard the information about
 * whether it has occurred.
 */
#define do_sqshl_b(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
#define do_sqshl_h(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
#define do_sqshl_s(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
#define do_sqshl_d(n, m) \
   ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)

#define do_uqshl_b(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_uqshl_h(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_uqshl_s(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
#define do_uqshl_d(n, m) \
   ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)

#define do_sqrshl_b(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
#define do_sqrshl_h(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
#define do_sqrshl_s(n, m) \
   ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
#define do_sqrshl_d(n, m) \
   ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)

#undef do_sqrshl_d

#define do_uqrshl_b(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
#define do_uqrshl_h(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
#define do_uqrshl_s(n, m) \
   ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
#define do_uqrshl_d(n, m) \
   ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)

#undef do_uqrshl_d

#define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))
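
/*
 * The 64-bit form cannot widen, so it reconstructs the carry:
 * (n + m) >> 1 == (n >> 1) + (m >> 1) + (n & m & 1), since the low
 * bits of n and m only contribute to the halved sum when both are
 * set.  E.g. n = m = 3: (3 + 3) >> 1 == 3 and 1 + 1 + 1 == 3.
 */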

DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)

#define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))

DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)

#define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))

DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)

static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    return val >= max ? max : val <= min ? min : val;
}

#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    int64_t r = n + m;
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
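
/*
 * The overflow test reads: the operands had the same sign
 * (~(n ^ m) has the sign bit set) but the result's sign differs
 * from n's (r ^ n has the sign bit set) -- the standard
 * two's-complement criterion for signed addition overflow.
 */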

DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)

#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)

static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t r = n + m;
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = n - m;
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}

DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)

static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    return n > m ? n - m : 0;
}

DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative. */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative. */
        }
    } else {
        /* Both inputs are positive: check for overflow.  */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}

DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)

static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        return n < -m ? 0 : r;
    }
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

#undef DO_ZPZZ
#undef DO_ZPZZ_D

/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements come from VN {I, I+1}.
 * If the slot I is odd, the elements come from VM {I-1, I}.
 * Load all of the input elements in each pair before overwriting output.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            TYPE n0 = *(TYPE *)(vn + H(i));                     \
            TYPE m0 = *(TYPE *)(vm + H(i));                     \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(n0, n1);              \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(m0, m1);              \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn, *m = vm;                             \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 2) {                           \
        TYPE n0 = n[i], n1 = n[i + 1];                          \
        TYPE m0 = m[i], m1 = m[i + 1];                          \
        if (pg[H1(i)] & 1) {                                    \
            d[i] = OP(n0, n1);                                  \
        }                                                       \
        if (pg[H1(i + 1)] & 1) {                                \
            d[i + 1] = OP(m0, m1);                              \
        }                                                       \
    }                                                           \
}
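
/*
 * Illustrative layout for bytes (assuming all predicate bits set):
 * d[0] = OP(n[0], n[1]), d[1] = OP(m[0], m[1]),
 * d[2] = OP(n[2], n[3]), d[3] = OP(m[2], m[3]), ...
 * i.e. the even results reduce pairs of N and the odd results
 * reduce the corresponding pairs of M.
 */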

DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D

#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  float_status *status, uint32_t desc)                  \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; ) {                                         \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                 \
        do {                                                            \
            TYPE n0 = *(TYPE *)(vn + H(i));                             \
            TYPE m0 = *(TYPE *)(vm + H(i));                             \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));              \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));              \
            if (pg & 1) {                                               \
                *(TYPE *)(vd + H(i)) = OP(n0, n1, status);              \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
            if (pg & 1) {                                               \
                *(TYPE *)(vd + H(i)) = OP(m0, m1, status);              \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 15);                                               \
    }                                                                   \
}

DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)

DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)

DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)

DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)

DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)

#undef DO_ZPZZ_PAIR_FP

/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; ) {                                         \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                     \
        TYPEW mm = *(TYPEW *)(vm + i);                                  \
        do {                                                            \
            if (pg & 1) {                                               \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                      \
            }                                                           \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                     \
        } while (i & 7);                                                \
    }                                                                   \
}
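
/*
 * E.g. for sve_asr_zpzw_b, one uint64_t shift count from M is
 * loaded per 8-byte column (note the "while (i & 7)" inner loop)
 * and applied to each of the 8 active byte elements of N in that
 * column.
 */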

DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW

/* Fully general two-operand expander, controlled by a predicate.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn);                  \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn);                                      \
        }                                                       \
    }                                                           \
}

#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 2) {
        if (pg[H1(i)] & 1) {
            uint64_t n0 = n[i + 0];
            uint64_t n1 = n[i + 1];
            d[i + 0] = n1;
            d[i + 1] = n0;
        }
    }
}

DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)

#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)

#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)

DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)

/* Three-operand expander, unpredicated, in which the third operand is "wide".
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    for (i = 0; i < opr_sz; ) {                                \
        TYPEW mm = *(TYPEW *)(vm + i);                         \
        do {                                                   \
            TYPE nn = *(TYPE *)(vn + H(i));                    \
            *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
            i += sizeof(TYPE);                                 \
        } while (i & 7);                                       \
    }                                                          \
}

DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D

/*
 * Three-operand expander, unpredicated, in which the two inputs are
 * selected from the top or bottom half of the wide column.
 */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
    }                                                                   \
}

DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

/* Note that the multiply cannot overflow, but the doubling can. */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t val = n * m;
    return DO_SQADD_H(val, val);
}

static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t val = n * m;
    return DO_SQADD_S(val, val);
}

static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    int64_t val = n * m;
    return do_sqadd_d(val, val);
}
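
/*
 * E.g. widened inputs n = m = -128 for do_sqdmull_h: the product
 * 16384 fits easily in int16_t, but doubling it would give 32768,
 * so DO_SQADD_H saturates the result to INT16_MAX.
 */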

DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)

#undef DO_ZZZ_TB

#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                     \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));              \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                   \
    }                                                          \
}

DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)

DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)

#undef DO_ZZZ_WTB

#define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
    intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
    for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                    \
        TYPE nn = *(TYPE *)(vn + H(i + sel1));                          \
        TYPE mm = *(TYPE *)(vm + H(i + sel2));                          \
        *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                       \
    }                                                                   \
}

DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)

#undef DO_ZZZ_NTB

#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);            \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));               \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));               \
        TYPEW aa = *(TYPEW *)(va + HW(i));                      \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;               \
    }                                                           \
}

DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)

DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)

#define DO_NMUL(N, M)  -(N * M)

DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)

DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)

#undef DO_ZZZW_ACC

#define DO_XTNB(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    intptr_t i, opr_sz = simd_oprsz(desc);                   \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
        TYPE nn = *(TYPE *)(vn + i);                         \
        nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
        *(TYPE *)(vd + i) = nn;                              \
    }                                                        \
}

#define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));      \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
        TYPE nn = *(TYPE *)(vn + i);                                    \
        *(TYPEN *)(vd + i + odd) = OP(nn);                              \
    }                                                                   \
}

#define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
#define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
#define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)

DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)

DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)

#define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
#define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
#define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)

DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)

DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)

DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)

DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)

#undef DO_XTNB
#undef DO_XTNT

void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
    uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t *a = va, *n = vn;
    uint64_t *d = vd, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint32_t e1 = a[2 * i + H4(0)];
        uint32_t e2 = n[2 * i + sel] ^ inv;
        uint64_t c = extract64(m[i], 32, 1);
        /* Compute and store the entire 33-bit result at once. */
        d[i] = c + e1 + e2;
    }
}
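
/*
 * E.g. e1 = e2 = 0xffffffff with carry-in 1 gives d[i] =
 * 0x1ffffffff: the low 32 bits are the result proper and bit 32
 * is the carry-out, which the next long-carry instruction in the
 * chain reads back via extract64(m[i], 32, 1).
 */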
1260 
1261 void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1262 {
1263     intptr_t i, opr_sz = simd_oprsz(desc);
1264     int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1265     uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1266     uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1267 
1268     for (i = 0; i < opr_sz / 8; i += 2) {
1269         Int128 e1 = int128_make64(a[i]);
1270         Int128 e2 = int128_make64(n[i + sel] ^ inv);
1271         Int128 c = int128_make64(m[i + 1] & 1);
1272         Int128 r = int128_add(int128_add(e1, e2), c);
1273         d[i + 0] = int128_getlo(r);
1274         d[i + 1] = int128_gethi(r);
1275     }
1276 }
1277 
1278 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1279 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1280 {                                                                       \
1281     intptr_t i, opr_sz = simd_oprsz(desc);                              \
1282     int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
1283     int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1284     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
1285         TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
1286         TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
1287         TYPEW aa = *(TYPEW *)(va + HW(i));                              \
1288         *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
1289     }                                                                   \
1290 }
1291 
1292 DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1293            do_sqdmull_h, DO_SQADD_H)
1294 DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1295            do_sqdmull_s, DO_SQADD_S)
1296 DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1297            do_sqdmull_d, do_sqadd_d)
1298 
1299 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1300            do_sqdmull_h, DO_SQSUB_H)
1301 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1302            do_sqdmull_s, DO_SQSUB_S)
1303 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
1304            do_sqdmull_d, do_sqsub_d)
1305 
1306 #undef DO_SQDMLAL
1307 
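/*
 * Complex integer multiply-add, one rotation per invocation.
 * ROT = 0, 1, 2, 3 selects a rotation of the addend product by 0, 90,
 * 180 or 270 degrees; e.g. ROT=1 yields d_r = a_r - n_i * m_i and
 * d_i = a_i + n_i * m_r.  Issuing the operation twice, with rotations
 * 0 and 1, therefore accumulates a full complex multiply.
 */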
1308 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1309 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1310 {                                                               \
1311     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
1312     int rot = simd_data(desc);                                  \
1313     int sel_a = rot & 1, sel_b = sel_a ^ 1;                     \
1314     bool sub_r = rot == 1 || rot == 2;                          \
1315     bool sub_i = rot >= 2;                                      \
1316     TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
1317     for (i = 0; i < opr_sz; i += 2) {                           \
1318         TYPE elt1_a = n[H(i + sel_a)];                          \
1319         TYPE elt2_a = m[H(i + sel_a)];                          \
1320         TYPE elt2_b = m[H(i + sel_b)];                          \
1321         d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);           \
1322         d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);   \
1323     }                                                           \
1324 }
1325 
1326 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1327 
1328 DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1329 DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1330 DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1331 DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
1332 
1333 #define DO_SQRDMLAH_B(N, M, A, S) \
1334     do_sqrdmlah_b(N, M, A, S, true)
1335 #define DO_SQRDMLAH_H(N, M, A, S) \
1336     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1337 #define DO_SQRDMLAH_S(N, M, A, S) \
1338     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1339 #define DO_SQRDMLAH_D(N, M, A, S) \
1340     do_sqrdmlah_d(N, M, A, S, true)
1341 
1342 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1343 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1344 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1345 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
1346 
1347 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1348 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)    \
1349 {                                                                           \
1350     intptr_t i, j, oprsz = simd_oprsz(desc);                                \
1351     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);                          \
1352     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2;                  \
1353     int sel_a = rot & 1, sel_b = sel_a ^ 1;                                 \
1354     bool sub_r = rot == 1 || rot == 2;                                      \
1355     bool sub_i = rot >= 2;                                                  \
1356     TYPE *d = vd, *n = vn, *m = vm, *a = va;                                \
1357     for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) {         \
1358         TYPE elt2_a = m[H(i + idx + sel_a)];                                \
1359         TYPE elt2_b = m[H(i + idx + sel_b)];                                \
1360         for (j = 0; j < 16 / sizeof(TYPE); j += 2) {                        \
1361             TYPE elt1_a = n[H(i + j + sel_a)];                              \
            d[H(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r);           \
            d[H(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i);   \
1364         }                                                                   \
1365     }                                                                       \
1366 }
1367 
1368 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
1369 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
1370 
1371 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1372 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1373 
1374 #undef DO_CMLA
1375 #undef DO_CMLA_FUNC
1376 #undef DO_CMLA_IDX_FUNC
1377 #undef DO_SQRDMLAH_B
1378 #undef DO_SQRDMLAH_H
1379 #undef DO_SQRDMLAH_S
1380 #undef DO_SQRDMLAH_D
1381 
/* Note that N and M are 4 narrow elements bundled into one unit:
   two complex pairs, with the real part in the low half of each pair.  */
1383 static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
1384                          int sel_a, int sel_b, int sub_i)
1385 {
1386     for (int i = 0; i <= 1; i++) {
1387         int32_t elt1_r = (int8_t)(n >> (16 * i));
1388         int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
1389         int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
1390         int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
1391 
1392         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1393     }
1394     return a;
1395 }
1396 
1397 static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
1398                          int sel_a, int sel_b, int sub_i)
1399 {
1400     for (int i = 0; i <= 1; i++) {
1401         int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
1402         int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
1403         int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
1404         int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
1405 
1406         a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
1407     }
1408     return a;
1409 }
1410 
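/*
 * The rotation selects which half of M multiplies each half of N, and
 * the sign of the second product: e.g. rot=0 accumulates
 * n_r * m_r - n_i * m_i (the real part of n * m), while rot=1
 * accumulates n_r * m_i + n_i * m_r (the imaginary part).
 */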
1411 void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
1412                               void *va, uint32_t desc)
1413 {
1414     int opr_sz = simd_oprsz(desc);
1415     int rot = simd_data(desc);
1416     int sel_a = rot & 1;
1417     int sel_b = sel_a ^ 1;
1418     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1419     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1420 
1421     for (int e = 0; e < opr_sz / 4; e++) {
1422         d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1423     }
1424 }
1425 
1426 void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
1427                               void *va, uint32_t desc)
1428 {
1429     int opr_sz = simd_oprsz(desc);
1430     int rot = simd_data(desc);
1431     int sel_a = rot & 1;
1432     int sel_b = sel_a ^ 1;
1433     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1434     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1435 
1436     for (int e = 0; e < opr_sz / 8; e++) {
1437         d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
1438     }
1439 }
1440 
1441 void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
1442                              void *va, uint32_t desc)
1443 {
1444     int opr_sz = simd_oprsz(desc);
1445     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1446     int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
1447     int sel_a = rot & 1;
1448     int sel_b = sel_a ^ 1;
1449     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1450     uint32_t *d = vd, *n = vn, *m = vm, *a = va;
1451 
1452     for (int seg = 0; seg < opr_sz / 4; seg += 4) {
1453         uint32_t seg_m = m[seg + idx];
1454         for (int e = 0; e < 4; e++) {
1455             d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
1456                                    sel_a, sel_b, sub_i);
1457         }
1458     }
1459 }
1460 
1461 void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
1462                              void *va, uint32_t desc)
1463 {
1464     int seg, opr_sz = simd_oprsz(desc);
1465     int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
1466     int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
1467     int sel_a = rot & 1;
1468     int sel_b = sel_a ^ 1;
1469     int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
1470     uint64_t *d = vd, *n = vn, *m = vm, *a = va;
1471 
1472     for (seg = 0; seg < opr_sz / 8; seg += 2) {
1473         uint64_t seg_m = m[seg + idx];
1474         for (int e = 0; e < 2; e++) {
1475             d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
1476                                    sel_a, sel_b, sub_i);
1477         }
1478     }
1479 }
1480 
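/*
 * Multiply-add, indexed.  As with all SVE indexed operations, the
 * index selects a single element within each 128-bit segment of ZM,
 * which is then applied to every element of the corresponding
 * segment of ZN.
 */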
1481 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1482 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1483 {                                                                       \
1484     intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
1485     intptr_t i, j, idx = simd_data(desc);                               \
1486     TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
1487     for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
1488         TYPE mm = m[i];                                                 \
1489         for (j = 0; j < segment; j++) {                                 \
1490             d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
1491         }                                                               \
1492     }                                                                   \
1493 }
1494 
1495 #define DO_SQRDMLAH_H(N, M, A) \
1496     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1497 #define DO_SQRDMLAH_S(N, M, A) \
1498     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1499 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1500 
1501 DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
1502 DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
1503 DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
1504 
1505 #define DO_SQRDMLSH_H(N, M, A) \
1506     ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1507 #define DO_SQRDMLSH_S(N, M, A) \
1508     ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1509 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1510 
1511 DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
1512 DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
1513 DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
1514 
1515 #undef DO_ZZXZ
1516 
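/*
 * Widening multiply-add, indexed.  SEL selects the bottom or top
 * narrow elements of ZN; IDX selects one narrow element within each
 * 128-bit segment of ZM.
 */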
1517 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1518 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
1519 {                                                                         \
1520     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1521     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1522     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1523     for (i = 0; i < oprsz; i += 16) {                                     \
1524         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1525         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1526             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1527             TYPEW aa = *(TYPEW *)(va + HW(i + j));                        \
1528             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa);                  \
1529         }                                                                 \
1530     }                                                                     \
1531 }
1532 
1533 #define DO_MLA(N, M, A)  (A + N * M)
1534 
1535 DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
1536 DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
1537 DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
1538 DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
1539 
1540 #define DO_MLS(N, M, A)  (A - N * M)
1541 
1542 DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
1543 DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
1544 DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
1545 DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
1546 
1547 #define DO_SQDMLAL_S(N, M, A)  DO_SQADD_S(A, do_sqdmull_s(N, M))
1548 #define DO_SQDMLAL_D(N, M, A)  do_sqadd_d(A, do_sqdmull_d(N, M))
1549 
1550 DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
1551 DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
1552 
1553 #define DO_SQDMLSL_S(N, M, A)  DO_SQSUB_S(A, do_sqdmull_s(N, M))
1554 #define DO_SQDMLSL_D(N, M, A)  do_sqsub_d(A, do_sqdmull_d(N, M))
1555 
1556 DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
1557 DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
1558 
1559 #undef DO_MLA
1560 #undef DO_MLS
1561 #undef DO_ZZXW
1562 
1563 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1564 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
1565 {                                                                         \
1566     intptr_t i, j, oprsz = simd_oprsz(desc);                              \
1567     intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);   \
1568     intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1569     for (i = 0; i < oprsz; i += 16) {                                     \
1570         TYPEW mm = *(TYPEN *)(vm + HN(i + idx));                          \
1571         for (j = 0; j < 16; j += sizeof(TYPEW)) {                         \
1572             TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel));                  \
1573             *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm);                      \
1574         }                                                                 \
1575     }                                                                     \
1576 }
1577 
1578 DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1579 DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
1580 
1581 DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1582 DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
1583 
1584 DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1585 DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
1586 
1587 #undef DO_ZZX
1588 
1589 #define DO_BITPERM(NAME, TYPE, OP) \
1590 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1591 {                                                              \
1592     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1593     for (i = 0; i < opr_sz; i += sizeof(TYPE)) {               \
1594         TYPE nn = *(TYPE *)(vn + i);                           \
1595         TYPE mm = *(TYPE *)(vm + i);                           \
1596         *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);      \
1597     }                                                          \
1598 }
1599 
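/* BEXT: gather the DATA bits selected by MASK into the low bits of
   the result.  E.g. bitextract(0xabcd, 0x0f0f, 16) extracts the
   nibbles 0xb and 0xd and packs them as 0xbd.  */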
1600 static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1601 {
1602     uint64_t res = 0;
1603     int db, rb = 0;
1604 
1605     for (db = 0; db < n; ++db) {
1606         if ((mask >> db) & 1) {
1607             res |= ((data >> db) & 1) << rb;
1608             ++rb;
1609         }
1610     }
1611     return res;
1612 }
1613 
1614 DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1615 DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1616 DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1617 DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1618 
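/* BDEP: the inverse of BEXT -- scatter the low bits of DATA out to
   the bit positions selected by MASK.  E.g. bitdeposit(0xbd, 0x0f0f,
   16) spreads the two nibbles back out to 0x0b0d.  */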
1619 static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1620 {
1621     uint64_t res = 0;
1622     int rb, db = 0;
1623 
1624     for (rb = 0; rb < n; ++rb) {
1625         if ((mask >> rb) & 1) {
1626             res |= ((data >> db) & 1) << rb;
1627             ++db;
1628         }
1629     }
1630     return res;
1631 }
1632 
1633 DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1634 DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1635 DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1636 DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1637 
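/* BGRP: group the DATA bits selected by MASK at the bottom of the
   result, with the remaining bits packed above them.
   E.g. bitgroup(0xabcd, 0x0f0f, 16) = 0xacbd.  */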
1638 static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1639 {
1640     uint64_t resm = 0, resu = 0;
1641     int db, rbm = 0, rbu = 0;
1642 
1643     for (db = 0; db < n; ++db) {
1644         uint64_t val = (data >> db) & 1;
1645         if ((mask >> db) & 1) {
1646             resm |= val << rbm++;
1647         } else {
1648             resu |= val << rbu++;
1649         }
1650     }
1651 
1652     return resm | (resu << rbm);
1653 }
1654 
1655 DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1656 DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1657 DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1658 DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1659 
1660 #undef DO_BITPERM
1661 
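/*
 * Complex integer add with rotate.  The single data bit selects the
 * rotation: 0 yields d_r = n_r - m_i, d_i = n_i + m_r (adding i*m),
 * 1 yields d_r = n_r + m_i, d_i = n_i - m_r (adding -i*m).  The
 * saturating SQCADD forms differ only in the add/sub primitives
 * supplied.
 */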
1662 #define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
1663 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
1664 {                                                               \
1665     intptr_t i, opr_sz = simd_oprsz(desc);                      \
1666     int sub_r = simd_data(desc);                                \
1667     if (sub_r) {                                                \
1668         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1669             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1670             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1671             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1672             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1673             acc_r = ADD_OP(acc_r, el2_i);                       \
1674             acc_i = SUB_OP(acc_i, el2_r);                       \
1675             *(TYPE *)(vd + H(i)) = acc_r;                       \
1676             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1677         }                                                       \
1678     } else {                                                    \
1679         for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
1680             TYPE acc_r = *(TYPE *)(vn + H(i));                  \
1681             TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
1682             TYPE el2_r = *(TYPE *)(vm + H(i));                  \
1683             TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
1684             acc_r = SUB_OP(acc_r, el2_i);                       \
1685             acc_i = ADD_OP(acc_i, el2_r);                       \
1686             *(TYPE *)(vd + H(i)) = acc_r;                       \
1687             *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
1688         }                                                       \
1689     }                                                           \
1690 }
1691 
1692 DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1693 DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1694 DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1695 DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
1696 
1697 DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1698 DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1699 DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1700 DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
1701 
1702 #undef DO_CADD
1703 
1704 #define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1705 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
1706 {                                                              \
1707     intptr_t i, opr_sz = simd_oprsz(desc);                     \
1708     intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);      \
1709     int shift = simd_data(desc) >> 1;                          \
1710     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {              \
1711         TYPEW nn = *(TYPEN *)(vn + HN(i + sel));               \
1712         *(TYPEW *)(vd + HW(i)) = nn << shift;                  \
1713     }                                                          \
1714 }
1715 
1716 DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1717 DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1718 DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
1719 
1720 DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1721 DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1722 DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
1723 
1724 #undef DO_ZZI_SHLL
1725 
1726 /* Two-operand reduction expander, controlled by a predicate.
1727  * The difference between TYPERED and TYPERET has to do with
1728  * sign-extension.  E.g. for SMAX, TYPERED must be signed,
1729  * but TYPERET must be unsigned so that e.g. a 32-bit value
1730  * is not sign-extended to the ABI uint64_t return type.
1731  */
1732 /* ??? If we were to vectorize this by hand the reduction ordering
1733  * would change.  For integer operands, this is perfectly fine.
1734  */
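/* The predicate is consumed 16 bits at a time, covering one 16-byte
 * granule of the vector.  Each element's flag is the predicate bit of
 * its least significant byte, hence the shift by sizeof(TYPEELT) per
 * element.
 */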
1735 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1736 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1737 {                                                          \
1738     intptr_t i, opr_sz = simd_oprsz(desc);                 \
1739     TYPERED ret = INIT;                                    \
1740     for (i = 0; i < opr_sz; ) {                            \
1741         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
1742         do {                                               \
1743             if (pg & 1) {                                  \
1744                 TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
1745                 ret = OP(ret, nn);                         \
1746             }                                              \
1747             i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
1748         } while (i & 15);                                  \
1749     }                                                      \
1750     return (TYPERET)ret;                                   \
1751 }
1752 
1753 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
1754 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
1755 {                                                          \
1756     intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
1757     TYPEE *n = vn;                                         \
1758     uint8_t *pg = vg;                                      \
1759     TYPER ret = INIT;                                      \
1760     for (i = 0; i < opr_sz; i += 1) {                      \
1761         if (pg[H1(i)] & 1) {                               \
1762             TYPEE nn = n[i];                               \
1763             ret = OP(ret, nn);                             \
1764         }                                                  \
1765     }                                                      \
1766     return ret;                                            \
1767 }
1768 
1769 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1770 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1771 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1772 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1773 
1774 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1775 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1776 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1777 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1778 
1779 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1780 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1781 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1782 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1783 
1784 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1785 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1786 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1787 
1788 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1789 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1790 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1791 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1792 
1793 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1794 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1795 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1796 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1797 
1798 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1799 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1800 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1801 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1802 
1803 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1804 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1805 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1806 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1807 
1808 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1809 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1810 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1811 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1812 
1813 #undef DO_VPZ
1814 #undef DO_VPZ_D
1815 
1816 /* Two vector operand, one scalar operand, unpredicated.  */
1817 #define DO_ZZI(NAME, TYPE, OP)                                       \
1818 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
1819 {                                                                    \
1820     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
1821     TYPE s = s64, *d = vd, *n = vn;                                  \
1822     for (i = 0; i < opr_sz; ++i) {                                   \
1823         d[i] = OP(n[i], s);                                          \
1824     }                                                                \
1825 }
1826 
1827 #define DO_SUBR(X, Y)   (Y - X)
1828 
1829 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1830 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1831 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1832 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1833 
1834 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1835 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1836 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1837 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1838 
1839 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1840 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1841 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1842 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1843 
1844 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1845 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1846 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1847 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1848 
1849 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1850 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1851 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1852 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1853 
1854 #undef DO_ZZI
1855 
1856 #undef DO_AND
1857 #undef DO_ORR
1858 #undef DO_EOR
1859 #undef DO_BIC
1860 #undef DO_ADD
1861 #undef DO_SUB
1862 #undef DO_MAX
1863 #undef DO_MIN
1864 #undef DO_ABD
1865 #undef DO_MUL
1866 #undef DO_DIV
1867 #undef DO_ASR
1868 #undef DO_LSR
1869 #undef DO_LSL
1870 #undef DO_SUBR
1871 
1872 /* Similar to the ARM LastActiveElement pseudocode function, except the
1873    result is multiplied by the element size.  This includes the not found
1874    indication; e.g. not found for esz=3 is -8.  */
1875 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1876 {
1877     uint64_t mask = pred_esz_masks[esz];
1878     intptr_t i = words;
1879 
1880     do {
1881         uint64_t this_g = g[--i] & mask;
1882         if (this_g) {
1883             return i * 64 + (63 - clz64(this_g));
1884         }
1885     } while (i > 0);
1886     return (intptr_t)-1 << esz;
1887 }
1888 
1889 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1890 {
1891     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1892     uint32_t flags = PREDTEST_INIT;
1893     uint64_t *d = vd, *g = vg;
1894     intptr_t i = 0;
1895 
1896     do {
1897         uint64_t this_d = d[i];
1898         uint64_t this_g = g[i];
1899 
1900         if (this_g) {
1901             if (!(flags & 4)) {
1902                 /* Set in D the first bit of G.  */
1903                 this_d |= this_g & -this_g;
1904                 d[i] = this_d;
1905             }
1906             flags = iter_predtest_fwd(this_d, this_g, flags);
1907         }
1908     } while (++i < words);
1909 
1910     return flags;
1911 }
1912 
1913 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1914 {
1915     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1916     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1917     uint32_t flags = PREDTEST_INIT;
1918     uint64_t *d = vd, *g = vg, esz_mask;
1919     intptr_t i, next;
1920 
1921     next = last_active_element(vd, words, esz) + (1 << esz);
1922     esz_mask = pred_esz_masks[esz];
1923 
1924     /* Similar to the pseudocode for pnext, but scaled by ESZ
1925        so that we find the correct bit.  */
1926     if (next < words * 64) {
1927         uint64_t mask = -1;
1928 
1929         if (next & 63) {
1930             mask = ~((1ull << (next & 63)) - 1);
1931             next &= -64;
1932         }
1933         do {
1934             uint64_t this_g = g[next / 64] & esz_mask & mask;
1935             if (this_g != 0) {
1936                 next = (next & -64) + ctz64(this_g);
1937                 break;
1938             }
1939             next += 64;
1940             mask = -1;
1941         } while (next < words * 64);
1942     }
1943 
1944     i = 0;
1945     do {
1946         uint64_t this_d = 0;
1947         if (i == next / 64) {
1948             this_d = 1ull << (next & 63);
1949         }
1950         d[i] = this_d;
1951         flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1952     } while (++i < words);
1953 
1954     return flags;
1955 }
1956 
1957 /*
1958  * Copy Zn into Zd, and store zero into inactive elements.
1959  * If inv, store zeros into the active elements.
1960  */
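/* expand_pred_b (_h, _s) expands one byte of predicate bits into a
   64-bit mask with all-ones for each active element, so the AND keeps
   the selected elements and XORing with INV inverts the selection.  */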
1961 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1962 {
1963     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1964     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1965     uint64_t *d = vd, *n = vn;
1966     uint8_t *pg = vg;
1967 
1968     for (i = 0; i < opr_sz; i += 1) {
1969         d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1970     }
1971 }
1972 
1973 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1974 {
1975     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1976     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1977     uint64_t *d = vd, *n = vn;
1978     uint8_t *pg = vg;
1979 
1980     for (i = 0; i < opr_sz; i += 1) {
1981         d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1982     }
1983 }
1984 
1985 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1986 {
1987     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1988     uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1989     uint64_t *d = vd, *n = vn;
1990     uint8_t *pg = vg;
1991 
1992     for (i = 0; i < opr_sz; i += 1) {
1993         d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1994     }
1995 }
1996 
1997 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1998 {
1999     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2000     uint64_t *d = vd, *n = vn;
2001     uint8_t *pg = vg;
2002     uint8_t inv = simd_data(desc);
2003 
2004     for (i = 0; i < opr_sz; i += 1) {
2005         d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
2006     }
2007 }
2008 
2009 /* Three-operand expander, immediate operand, controlled by a predicate.
2010  */
2011 #define DO_ZPZI(NAME, TYPE, H, OP)                              \
2012 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2013 {                                                               \
2014     intptr_t i, opr_sz = simd_oprsz(desc);                      \
2015     TYPE imm = simd_data(desc);                                 \
2016     for (i = 0; i < opr_sz; ) {                                 \
2017         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
2018         do {                                                    \
2019             if (pg & 1) {                                       \
2020                 TYPE nn = *(TYPE *)(vn + H(i));                 \
2021                 *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
2022             }                                                   \
2023             i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
2024         } while (i & 15);                                       \
2025     }                                                           \
2026 }
2027 
2028 /* Similarly, specialized for 64-bit operands.  */
2029 #define DO_ZPZI_D(NAME, TYPE, OP)                               \
2030 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
2031 {                                                               \
2032     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
2033     TYPE *d = vd, *n = vn;                                      \
2034     TYPE imm = simd_data(desc);                                 \
2035     uint8_t *pg = vg;                                           \
2036     for (i = 0; i < opr_sz; i += 1) {                           \
2037         if (pg[H1(i)] & 1) {                                    \
2038             TYPE nn = n[i];                                     \
2039             d[i] = OP(nn, imm);                                 \
2040         }                                                       \
2041     }                                                           \
2042 }
2043 
2044 #define DO_SHR(N, M)  (N >> M)
2045 #define DO_SHL(N, M)  (N << M)
2046 
2047 /* Arithmetic shift right for division.  This rounds negative numbers
2048    toward zero as per signed division.  Therefore before shifting,
2049    when N is negative, add 2**M-1.  */
2050 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
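/* E.g. for M=2, DO_ASRD(-5, 2) = (-5 + 3) >> 2 = -1 = trunc(-5 / 4),
   where the plain arithmetic shift -5 >> 2 would floor to -2.  */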
2051 
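/* Unsigned rounding shift right: the shifted-out bit just below the
   result is added back in, rounding to nearest with ties upward.
   E.g. do_urshr(5, 1) = 2 + 1 = 3.  do_srshr below is the signed
   equivalent.  */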
2052 static inline uint64_t do_urshr(uint64_t x, unsigned sh)
2053 {
2054     if (likely(sh < 64)) {
2055         return (x >> sh) + ((x >> (sh - 1)) & 1);
2056     } else if (sh == 64) {
2057         return x >> 63;
2058     } else {
2059         return 0;
2060     }
2061 }
2062 
2063 static inline int64_t do_srshr(int64_t x, unsigned sh)
2064 {
2065     if (likely(sh < 64)) {
2066         return (x >> sh) + ((x >> (sh - 1)) & 1);
2067     } else {
2068         /* Rounding the sign bit always produces 0. */
2069         return 0;
2070     }
2071 }
2072 
2073 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
2074 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
2075 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
2076 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
2077 
2078 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
2079 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
2080 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
2081 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
2082 
2083 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
2084 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
2085 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
2086 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2087 
2088 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2089 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2090 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2091 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2092 
2093 /* SVE2 bitwise shift by immediate */
2094 DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
2095 DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
2096 DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
2097 DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
2098 
2099 DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
2100 DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
2101 DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
2102 DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
2103 
2104 DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
2105 DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
2106 DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
2107 DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
2108 
2109 DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
2110 DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
2111 DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
2112 DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
2113 
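/* These wrappers discard the saturation flag.  Note that the macro
   do_suqrshl_d can safely invoke the function of the same name: a
   function-like macro is not re-expanded within its own replacement
   list.  */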
2114 #define do_suqrshl_b(n, m) \
2115    ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
2116 #define do_suqrshl_h(n, m) \
2117    ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
2118 #define do_suqrshl_s(n, m) \
2119    ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
2120 #define do_suqrshl_d(n, m) \
2121    ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
2122 
2123 DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
2124 DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
2125 DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
2126 DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
2127 
2128 #undef DO_ASRD
2129 #undef DO_ZPZI
2130 #undef DO_ZPZI_D
2131 
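/*
 * Shift-and-narrow expanders.  The B ("bottom") forms write the
 * narrowed result to the even elements and, by storing the TYPEN
 * result as a full TYPEW, zero the odd elements; the T ("top") forms
 * write only the odd elements and leave the even ones untouched.
 */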
2132 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2133 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
2134 {                                                            \
2135     intptr_t i, opr_sz = simd_oprsz(desc);                   \
2136     int shift = simd_data(desc);                             \
2137     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
2138         TYPEW nn = *(TYPEW *)(vn + i);                       \
2139         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
2140     }                                                        \
2141 }
2142 
2143 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
2144 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
2145 {                                                                 \
2146     intptr_t i, opr_sz = simd_oprsz(desc);                        \
2147     int shift = simd_data(desc);                                  \
2148     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
2149         TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
2150         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
2151     }                                                             \
2152 }
2153 
2154 DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2155 DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2156 DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2157 
2158 DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2159 DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2160 DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
2161 
2162 DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2163 DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2164 DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2165 
2166 DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2167 DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2168 DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
2169 
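/* The shift count for the _d form is clamped defensively: shifting an
   int64_t by 64 would be undefined behaviour in C, while a shift by
   63 leaves only the sign, which saturates identically.  */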
2170 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2171 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2172 #define DO_SQSHRUN_D(x, sh) \
2173     do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2174 
2175 DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2176 DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2177 DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2178 
2179 DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2180 DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2181 DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
2182 
2183 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2184 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2185 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2186 
2187 DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2188 DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2189 DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2190 
2191 DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2192 DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2193 DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
2194 
2195 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2196 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2197 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2198 
2199 DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2200 DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2201 DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2202 
2203 DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2204 DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2205 DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
2206 
2207 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2208 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2209 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2210 
2211 DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2212 DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2213 DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2214 
2215 DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2216 DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2217 DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
2218 
2219 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2220 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2221 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2222 
2223 DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2224 DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2225 DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2226 
2227 DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2228 DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2229 DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
2230 
2231 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2232 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2233 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2234 
2235 DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2236 DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2237 DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2238 
2239 DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2240 DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2241 DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
2242 
2243 #undef DO_SHRNB
2244 #undef DO_SHRNT
2245 
2246 #define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                           \
2247 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2248 {                                                                           \
2249     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2250     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2251         TYPEW nn = *(TYPEW *)(vn + i);                                      \
2252         TYPEW mm = *(TYPEW *)(vm + i);                                      \
2253         *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);                      \
2254     }                                                                       \
2255 }
2256 
2257 #define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)                   \
2258 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
2259 {                                                                           \
2260     intptr_t i, opr_sz = simd_oprsz(desc);                                  \
2261     for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                           \
2262         TYPEW nn = *(TYPEW *)(vn + HW(i));                                  \
2263         TYPEW mm = *(TYPEW *)(vm + HW(i));                                  \
2264         *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);         \
2265     }                                                                       \
2266 }
2267 
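/* Add/subtract high-half narrowing: keep the high TYPEN half of the
   sum or difference.  The R forms add 1 << (SH - 1) first, rounding
   the discarded half to nearest instead of truncating.  */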
2268 #define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
2269 #define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
2270 #define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
2271 #define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
2272 
2273 DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2274 DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2275 DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2276 
2277 DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2278 DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2279 DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
2280 
2281 DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2282 DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2283 DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2284 
2285 DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2286 DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2287 DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
2288 
2289 DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
2290 DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
2291 DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
2292 
2293 DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
2294 DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
2295 DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
2296 
2297 DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
2298 DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
2299 DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
2300 
2301 DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
2302 DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
2303 DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
2304 
2305 #undef DO_RSUBHN
2306 #undef DO_SUBHN
2307 #undef DO_RADDHN
2308 #undef DO_ADDHN
2309 
2310 #undef DO_BINOPNB
2311 
2312 /* Fully general four-operand expander, controlled by a predicate.
2313  */
2314 #define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
2315 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2316                   void *vg, uint32_t desc)                    \
2317 {                                                             \
2318     intptr_t i, opr_sz = simd_oprsz(desc);                    \
2319     for (i = 0; i < opr_sz; ) {                               \
2320         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
2321         do {                                                  \
2322             if (pg & 1) {                                     \
2323                 TYPE nn = *(TYPE *)(vn + H(i));               \
2324                 TYPE mm = *(TYPE *)(vm + H(i));               \
2325                 TYPE aa = *(TYPE *)(va + H(i));               \
2326                 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
2327             }                                                 \
2328             i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
2329         } while (i & 15);                                     \
2330     }                                                         \
2331 }
2332 
2333 /* Similarly, specialized for 64-bit operands.  */
2334 #define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
2335 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
2336                   void *vg, uint32_t desc)                    \
2337 {                                                             \
2338     intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
2339     TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
2340     uint8_t *pg = vg;                                         \
2341     for (i = 0; i < opr_sz; i += 1) {                         \
2342         if (pg[H1(i)] & 1) {                                  \
2343             TYPE aa = a[i], nn = n[i], mm = m[i];             \
2344             d[i] = OP(aa, nn, mm);                            \
2345         }                                                     \
2346     }                                                         \
2347 }
2348 
2349 #define DO_MLA(A, N, M)  (A + N * M)
2350 #define DO_MLS(A, N, M)  (A - N * M)
2351 
2352 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2353 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2354 
2355 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2356 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2357 
2358 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2359 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2360 
2361 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2362 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2363 
2364 #undef DO_MLA
2365 #undef DO_MLS
2366 #undef DO_ZPZZZ
2367 #undef DO_ZPZZZ_D
2368 
2369 void HELPER(sve_index_b)(void *vd, uint32_t start,
2370                          uint32_t incr, uint32_t desc)
2371 {
2372     intptr_t i, opr_sz = simd_oprsz(desc);
2373     uint8_t *d = vd;
2374     for (i = 0; i < opr_sz; i += 1) {
2375         d[H1(i)] = start + i * incr;
2376     }
2377 }
2378 
2379 void HELPER(sve_index_h)(void *vd, uint32_t start,
2380                          uint32_t incr, uint32_t desc)
2381 {
2382     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2383     uint16_t *d = vd;
2384     for (i = 0; i < opr_sz; i += 1) {
2385         d[H2(i)] = start + i * incr;
2386     }
2387 }
2388 
2389 void HELPER(sve_index_s)(void *vd, uint32_t start,
2390                          uint32_t incr, uint32_t desc)
2391 {
2392     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2393     uint32_t *d = vd;
2394     for (i = 0; i < opr_sz; i += 1) {
2395         d[H4(i)] = start + i * incr;
2396     }
2397 }
2398 
2399 void HELPER(sve_index_d)(void *vd, uint64_t start,
2400                          uint64_t incr, uint32_t desc)
2401 {
2402     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2403     uint64_t *d = vd;
2404     for (i = 0; i < opr_sz; i += 1) {
2405         d[i] = start + i * incr;
2406     }
2407 }
2408 
2409 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2410 {
2411     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2412     uint32_t sh = simd_data(desc);
2413     uint32_t *d = vd, *n = vn, *m = vm;
2414     for (i = 0; i < opr_sz; i += 1) {
2415         d[i] = n[i] + (m[i] << sh);
2416     }
2417 }
2418 
2419 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2420 {
2421     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2422     uint64_t sh = simd_data(desc);
2423     uint64_t *d = vd, *n = vn, *m = vm;
2424     for (i = 0; i < opr_sz; i += 1) {
2425         d[i] = n[i] + (m[i] << sh);
2426     }
2427 }
2428 
2429 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2430 {
2431     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2432     uint64_t sh = simd_data(desc);
2433     uint64_t *d = vd, *n = vn, *m = vm;
2434     for (i = 0; i < opr_sz; i += 1) {
2435         d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2436     }
2437 }
2438 
2439 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2440 {
2441     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2442     uint64_t sh = simd_data(desc);
2443     uint64_t *d = vd, *n = vn, *m = vm;
2444     for (i = 0; i < opr_sz; i += 1) {
2445         d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2446     }
2447 }
2448 
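/*
 * FEXPA: the low bits of each element index a table holding the
 * fraction field of 2**(idx / 32) for half precision (2**(idx / 64)
 * for single and double), and the next bits are placed directly into
 * the exponent field.  The packed result is thus a coarse 2**x,
 * intended as a starting point for computing exp(x) in software.
 * E.g. coeff[16] below is 0x01a8, the 10-bit fraction of sqrt(2).
 */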
2449 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2450 {
    /* These constants are cut and pasted directly from the ARM pseudocode.  */
2452     static const uint16_t coeff[] = {
2453         0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2454         0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2455         0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2456         0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2457     };
2458     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2459     uint16_t *d = vd, *n = vn;
2460 
2461     for (i = 0; i < opr_sz; i++) {
2462         uint16_t nn = n[i];
2463         intptr_t idx = extract32(nn, 0, 5);
2464         uint16_t exp = extract32(nn, 5, 5);
2465         d[i] = coeff[idx] | (exp << 10);
2466     }
2467 }
2468 
2469 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2470 {
    /* These constants are cut and pasted directly from the ARM pseudocode.  */
2472     static const uint32_t coeff[] = {
2473         0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2474         0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2475         0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2476         0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2477         0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2478         0x1ef532, 0x20b051, 0x227043, 0x243516,
2479         0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2480         0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2481         0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2482         0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2483         0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2484         0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2485         0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2486         0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2487         0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2488         0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2489     };
2490     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2491     uint32_t *d = vd, *n = vn;
2492 
2493     for (i = 0; i < opr_sz; i++) {
2494         uint32_t nn = n[i];
2495         intptr_t idx = extract32(nn, 0, 6);
2496         uint32_t exp = extract32(nn, 6, 8);
2497         d[i] = coeff[idx] | (exp << 23);
2498     }
2499 }
2500 
2501 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2502 {
    /* These constants are cut and pasted directly from the ARM pseudocode.  */
2504     static const uint64_t coeff[] = {
2505         0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2506         0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2507         0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2508         0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2509         0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2510         0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2511         0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2512         0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2513         0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2514         0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2515         0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2516         0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2517         0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2518         0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2519         0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2520         0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2521         0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2522         0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2523         0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2524         0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2525         0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2526         0xFA7C1819E90D8ull,
2527     };
2528     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2529     uint64_t *d = vd, *n = vn;
2530 
2531     for (i = 0; i < opr_sz; i++) {
2532         uint64_t nn = n[i];
2533         intptr_t idx = extract32(nn, 0, 6);
2534         uint64_t exp = extract32(nn, 6, 11);
2535         d[i] = coeff[idx] | (exp << 52);
2536     }
2537 }
2538 
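/* FTSSEL (trig select coefficient): bit 0 of each M element selects
   between the corresponding N element and 1.0; bit 1 is then XORed
   into the sign bit of the result.  */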
2539 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2540 {
2541     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2542     uint16_t *d = vd, *n = vn, *m = vm;
2543     for (i = 0; i < opr_sz; i += 1) {
2544         uint16_t nn = n[i];
2545         uint16_t mm = m[i];
2546         if (mm & 1) {
2547             nn = float16_one;
2548         }
2549         d[i] = nn ^ (mm & 2) << 14;
2550     }
2551 }
2552 
2553 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2554 {
2555     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2556     uint32_t *d = vd, *n = vn, *m = vm;
2557     for (i = 0; i < opr_sz; i += 1) {
2558         uint32_t nn = n[i];
2559         uint32_t mm = m[i];
2560         if (mm & 1) {
2561             nn = float32_one;
2562         }
2563         d[i] = nn ^ (mm & 2) << 30;
2564     }
2565 }
2566 
2567 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2568 {
2569     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2570     uint64_t *d = vd, *n = vn, *m = vm;
2571     for (i = 0; i < opr_sz; i += 1) {
2572         uint64_t nn = n[i];
2573         uint64_t mm = m[i];
2574         if (mm & 1) {
2575             nn = float64_one;
2576         }
2577         d[i] = nn ^ (mm & 2) << 62;
2578     }
2579 }
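
/*
 * Illustrative example for the 64-bit form above: with mm = 3, bit 0
 * replaces nn with float64_one (0x3ff0000000000000) and bit 1 flips
 * the sign via the XOR of (mm & 2) << 62, giving -1.0.
 */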
2580 
2581 /*
2582  * Signed saturating addition with scalar operand.
2583  */
2584 
2585 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2586 {
2587     intptr_t i, oprsz = simd_oprsz(desc);
2588 
2589     for (i = 0; i < oprsz; i += sizeof(int8_t)) {
2590         *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
2591     }
2592 }
2593 
2594 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2595 {
2596     intptr_t i, oprsz = simd_oprsz(desc);
2597 
2598     for (i = 0; i < oprsz; i += sizeof(int16_t)) {
2599         *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
2600     }
2601 }
2602 
2603 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2604 {
2605     intptr_t i, oprsz = simd_oprsz(desc);
2606 
2607     for (i = 0; i < oprsz; i += sizeof(int32_t)) {
2608         *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
2609     }
2610 }
2611 
2612 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2613 {
2614     intptr_t i, oprsz = simd_oprsz(desc);
2615 
2616     for (i = 0; i < oprsz; i += sizeof(int64_t)) {
2617         *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2618     }
2619 }
2620 
2621 /*
2622  * Unsigned saturating addition with scalar operand.
2623  */
2624 
2625 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2626 {
2627     intptr_t i, oprsz = simd_oprsz(desc);
2628 
2629     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
2630         *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2631     }
2632 }
2633 
2634 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2635 {
2636     intptr_t i, oprsz = simd_oprsz(desc);
2637 
2638     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
2639         *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2640     }
2641 }
2642 
2643 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2644 {
2645     intptr_t i, oprsz = simd_oprsz(desc);
2646 
2647     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
2648         *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2649     }
2650 }
2651 
2652 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2653 {
2654     intptr_t i, oprsz = simd_oprsz(desc);
2655 
2656     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2657         *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2658     }
2659 }
2660 
2661 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2662 {
2663     intptr_t i, oprsz = simd_oprsz(desc);
2664 
2665     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
2666         *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2667     }
2668 }
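
/*
 * Note that B may be negative for the _b/_h/_s forms, which yields a
 * saturating subtract: e.g. DO_UQADD_S(-1, 0) clamps to 0 rather than
 * wrapping.  Only the _d form needs the separate uqsubi helper above,
 * because an int64_t cannot represent the negation of every uint64_t.
 */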
2669 
2670 /* Two operand predicated copy immediate with merge.  All valid immediates
2671  * can fit within 17 signed bits in the simd_data field.
2672  */
2673 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2674                          uint64_t mm, uint32_t desc)
2675 {
2676     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2677     uint64_t *d = vd, *n = vn;
2678     uint8_t *pg = vg;
2679 
2680     mm = dup_const(MO_8, mm);
2681     for (i = 0; i < opr_sz; i += 1) {
2682         uint64_t nn = n[i];
2683         uint64_t pp = expand_pred_b(pg[H1(i)]);
2684         d[i] = (mm & pp) | (nn & ~pp);
2685     }
2686 }
2687 
2688 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2689                          uint64_t mm, uint32_t desc)
2690 {
2691     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2692     uint64_t *d = vd, *n = vn;
2693     uint8_t *pg = vg;
2694 
2695     mm = dup_const(MO_16, mm);
2696     for (i = 0; i < opr_sz; i += 1) {
2697         uint64_t nn = n[i];
2698         uint64_t pp = expand_pred_h(pg[H1(i)]);
2699         d[i] = (mm & pp) | (nn & ~pp);
2700     }
2701 }
2702 
2703 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2704                          uint64_t mm, uint32_t desc)
2705 {
2706     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2707     uint64_t *d = vd, *n = vn;
2708     uint8_t *pg = vg;
2709 
2710     mm = dup_const(MO_32, mm);
2711     for (i = 0; i < opr_sz; i += 1) {
2712         uint64_t nn = n[i];
2713         uint64_t pp = expand_pred_s(pg[H1(i)]);
2714         d[i] = (mm & pp) | (nn & ~pp);
2715     }
2716 }
2717 
2718 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2719                          uint64_t mm, uint32_t desc)
2720 {
2721     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2722     uint64_t *d = vd, *n = vn;
2723     uint8_t *pg = vg;
2724 
2725     for (i = 0; i < opr_sz; i += 1) {
2726         uint64_t nn = n[i];
2727         d[i] = (pg[H1(i)] & 1 ? mm : nn);
2728     }
2729 }
2730 
2731 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2732 {
2733     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2734     uint64_t *d = vd;
2735     uint8_t *pg = vg;
2736 
2737     val = dup_const(MO_8, val);
2738     for (i = 0; i < opr_sz; i += 1) {
2739         d[i] = val & expand_pred_b(pg[H1(i)]);
2740     }
2741 }
2742 
2743 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2744 {
2745     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2746     uint64_t *d = vd;
2747     uint8_t *pg = vg;
2748 
2749     val = dup_const(MO_16, val);
2750     for (i = 0; i < opr_sz; i += 1) {
2751         d[i] = val & expand_pred_h(pg[H1(i)]);
2752     }
2753 }
2754 
2755 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2756 {
2757     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2758     uint64_t *d = vd;
2759     uint8_t *pg = vg;
2760 
2761     val = dup_const(MO_32, val);
2762     for (i = 0; i < opr_sz; i += 1) {
2763         d[i] = val & expand_pred_s(pg[H1(i)]);
2764     }
2765 }
2766 
2767 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2768 {
2769     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2770     uint64_t *d = vd;
2771     uint8_t *pg = vg;
2772 
2773     for (i = 0; i < opr_sz; i += 1) {
2774         d[i] = (pg[H1(i)] & 1 ? val : 0);
2775     }
2776 }
2777 
2778 /* Big-endian hosts need to frob the byte indices.  If the copy
2779  * happens to be 8-byte aligned, then no frobbing is necessary.
2780  */
2781 static void swap_memmove(void *vd, void *vs, size_t n)
2782 {
2783     uintptr_t d = (uintptr_t)vd;
2784     uintptr_t s = (uintptr_t)vs;
2785     uintptr_t o = (d | s | n) & 7;
2786     size_t i;
2787 
2788 #if !HOST_BIG_ENDIAN
2789     o = 0;
2790 #endif
2791     switch (o) {
2792     case 0:
2793         memmove(vd, vs, n);
2794         break;
2795 
2796     case 4:
2797         if (d < s || d >= s + n) {
2798             for (i = 0; i < n; i += 4) {
2799                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2800             }
2801         } else {
2802             for (i = n; i > 0; ) {
2803                 i -= 4;
2804                 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2805             }
2806         }
2807         break;
2808 
2809     case 2:
2810     case 6:
2811         if (d < s || d >= s + n) {
2812             for (i = 0; i < n; i += 2) {
2813                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2814             }
2815         } else {
2816             for (i = n; i > 0; ) {
2817                 i -= 2;
2818                 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2819             }
2820         }
2821         break;
2822 
2823     default:
2824         if (d < s || d >= s + n) {
2825             for (i = 0; i < n; i++) {
2826                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2827             }
2828         } else {
2829             for (i = n; i > 0; ) {
2830                 i -= 1;
2831                 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2832             }
2833         }
2834         break;
2835     }
2836 }
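
/*
 * Illustrative example of the frobbing: on a big-endian host, the
 * uint64_t holding logical SVE bytes 0..7 stores logical byte 0 at
 * host offset 7.  A 4-byte unit at logical offset 0 therefore lives
 * at host offset 4, which is the adjustment H1_4 (x ^ 4) provides.
 */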
2837 
2838 /* Similarly for memset of 0.  */
2839 static void swap_memzero(void *vd, size_t n)
2840 {
2841     uintptr_t d = (uintptr_t)vd;
2842     uintptr_t o = (d | n) & 7;
2843     size_t i;
2844 
2845     /* Usually, the first bit of a predicate is set, so N is 0.  */
2846     if (likely(n == 0)) {
2847         return;
2848     }
2849 
2850 #if !HOST_BIG_ENDIAN
2851     o = 0;
2852 #endif
2853     switch (o) {
2854     case 0:
2855         memset(vd, 0, n);
2856         break;
2857 
2858     case 4:
2859         for (i = 0; i < n; i += 4) {
2860             *(uint32_t *)H1_4(d + i) = 0;
2861         }
2862         break;
2863 
2864     case 2:
2865     case 6:
2866         for (i = 0; i < n; i += 2) {
2867             *(uint16_t *)H1_2(d + i) = 0;
2868         }
2869         break;
2870 
2871     default:
2872         for (i = 0; i < n; i++) {
2873             *(uint8_t *)H1(d + i) = 0;
2874         }
2875         break;
2876     }
2877 }
2878 
2879 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2880 {
2881     intptr_t opr_sz = simd_oprsz(desc);
2882     size_t n_ofs = simd_data(desc);
2883     size_t n_siz = opr_sz - n_ofs;
2884 
2885     if (vd != vm) {
2886         swap_memmove(vd, vn + n_ofs, n_siz);
2887         swap_memmove(vd + n_siz, vm, n_ofs);
2888     } else if (vd != vn) {
2889         swap_memmove(vd + n_siz, vd, n_ofs);
2890         swap_memmove(vd, vn + n_ofs, n_siz);
2891     } else {
2892         /* vd == vn == vm.  Need temp space.  */
2893         ARMVectorReg tmp;
2894         swap_memmove(&tmp, vm, n_ofs);
2895         swap_memmove(vd, vd + n_ofs, n_siz);
2896         memcpy(vd + n_siz, &tmp, n_ofs);
2897     }
2898 }
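
/*
 * Illustrative example: for a 16-byte vector with n_ofs = 3, bytes
 * 3..15 of N land in bytes 0..12 of the result and bytes 0..2 of M
 * fill bytes 13..15 -- the EXT concatenation.  The two swap_memmove
 * orderings above exist only to cope with destructive overlap.
 */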
2899 
2900 #define DO_INSR(NAME, TYPE, H) \
2901 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2902 {                                                                  \
2903     intptr_t opr_sz = simd_oprsz(desc);                            \
2904     swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
2905     *(TYPE *)(vd + H(0)) = val;                                    \
2906 }
2907 
2908 DO_INSR(sve_insr_b, uint8_t, H1)
2909 DO_INSR(sve_insr_h, uint16_t, H1_2)
2910 DO_INSR(sve_insr_s, uint32_t, H1_4)
2911 DO_INSR(sve_insr_d, uint64_t, H1_8)
2912 
2913 #undef DO_INSR
2914 
2915 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2916 {
2917     intptr_t i, j, opr_sz = simd_oprsz(desc);
2918     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2919         uint64_t f = *(uint64_t *)(vn + i);
2920         uint64_t b = *(uint64_t *)(vn + j);
2921         *(uint64_t *)(vd + i) = bswap64(b);
2922         *(uint64_t *)(vd + j) = bswap64(f);
2923     }
2924 }
2925 
2926 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2927 {
2928     intptr_t i, j, opr_sz = simd_oprsz(desc);
2929     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2930         uint64_t f = *(uint64_t *)(vn + i);
2931         uint64_t b = *(uint64_t *)(vn + j);
2932         *(uint64_t *)(vd + i) = hswap64(b);
2933         *(uint64_t *)(vd + j) = hswap64(f);
2934     }
2935 }
2936 
2937 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2938 {
2939     intptr_t i, j, opr_sz = simd_oprsz(desc);
2940     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2941         uint64_t f = *(uint64_t *)(vn + i);
2942         uint64_t b = *(uint64_t *)(vn + j);
2943         *(uint64_t *)(vd + i) = rol64(b, 32);
2944         *(uint64_t *)(vd + j) = rol64(f, 32);
2945     }
2946 }
2947 
2948 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2949 {
2950     intptr_t i, j, opr_sz = simd_oprsz(desc);
2951     for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2952         uint64_t f = *(uint64_t *)(vn + i);
2953         uint64_t b = *(uint64_t *)(vn + j);
2954         *(uint64_t *)(vd + i) = b;
2955         *(uint64_t *)(vd + j) = f;
2956     }
2957 }
2958 
2959 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
2960 
2961 static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
2962                            bool is_tbx, tb_impl_fn *fn)
2963 {
2964     ARMVectorReg scratch;
2965     uintptr_t oprsz = simd_oprsz(desc);
2966 
2967     if (unlikely(vd == vn)) {
2968         vn = memcpy(&scratch, vn, oprsz);
2969     }
2970 
2971     fn(vd, vn, NULL, vm, oprsz, is_tbx);
2972 }
2973 
2974 static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
2975                            uint32_t desc, bool is_tbx, tb_impl_fn *fn)
2976 {
2977     ARMVectorReg scratch;
2978     uintptr_t oprsz = simd_oprsz(desc);
2979 
2980     if (unlikely(vd == vn0)) {
2981         vn0 = memcpy(&scratch, vn0, oprsz);
2982         if (vd == vn1) {
2983             vn1 = vn0;
2984         }
2985     } else if (unlikely(vd == vn1)) {
2986         vn1 = memcpy(&scratch, vn1, oprsz);
2987     }
2988 
2989     fn(vd, vn0, vn1, vm, oprsz, is_tbx);
2990 }
2991 
2992 #define DO_TB(SUFF, TYPE, H)                                            \
2993 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1,         \
2994                                 void *vm, uintptr_t oprsz, bool is_tbx) \
2995 {                                                                       \
2996     TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm;              \
2997     uintptr_t i, nelem = oprsz / sizeof(TYPE);                          \
2998     for (i = 0; i < nelem; ++i) {                                       \
2999         TYPE index = indexes[H(i)], val = 0;                            \
3000         if (index < nelem) {                                            \
3001             val = tbl0[H(index)];                                       \
3002         } else {                                                        \
3003             index -= nelem;                                             \
3004             if (tbl1 && index < nelem) {                                \
3005                 val = tbl1[H(index)];                                   \
3006             } else if (is_tbx) {                                        \
3007                 continue;                                               \
3008             }                                                           \
3009         }                                                               \
3010         d[H(i)] = val;                                                  \
3011     }                                                                   \
3012 }                                                                       \
3013 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3014 {                                                                       \
3015     do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF);                     \
3016 }                                                                       \
3017 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1,            \
3018                              void *vm, uint32_t desc)                   \
3019 {                                                                       \
3020     do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF);               \
3021 }                                                                       \
3022 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3023 {                                                                       \
3024     do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF);                      \
3025 }
3026 
3027 DO_TB(b, uint8_t, H1)
3028 DO_TB(h, uint16_t, H2)
3029 DO_TB(s, uint32_t, H4)
3030 DO_TB(d, uint64_t, H8)
3031 
3032 #undef DO_TB
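
/*
 * Note the TBL/TBX difference visible in do_tb_* above: an index that
 * is out of range for both tables yields zero for TBL, but leaves the
 * destination element unchanged for TBX (the "continue" path skips
 * the store).
 */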
3033 
3034 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3035 void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
3036 {                                                              \
3037     intptr_t i, opr_sz = simd_oprsz(desc);                     \
3038     TYPED *d = vd;                                             \
3039     TYPES *n = vn;                                             \
3040     ARMVectorReg tmp;                                          \
3041     if (unlikely(vn - vd < opr_sz)) {                          \
3042         n = memcpy(&tmp, n, opr_sz / 2);                       \
3043     }                                                          \
3044     for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
3045         d[HD(i)] = n[HS(i)];                                   \
3046     }                                                          \
3047 }
3048 
3049 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
3050 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
3051 DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
3052 
3053 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
3054 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
3055 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
3056 
3057 #undef DO_UNPK
3058 
3059 /* Mask of bits included in the even numbered predicates of width esz.
3060  * We also use this for expand_bits/compress_bits, and so extend the
3061  * same pattern out to 16-bit units.
3062  */
3063 static const uint64_t even_bit_esz_masks[5] = {
3064     0x5555555555555555ull,
3065     0x3333333333333333ull,
3066     0x0f0f0f0f0f0f0f0full,
3067     0x00ff00ff00ff00ffull,
3068     0x0000ffff0000ffffull,
3069 };
3070 
3071 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3072  * For N==0, this corresponds to the operation that in qemu/bitops.h
3073  * we call half_shuffle64; this algorithm is from Hacker's Delight,
3074  * section 7-2 Shuffling Bits.
3075  */
3076 static uint64_t expand_bits(uint64_t x, int n)
3077 {
3078     int i;
3079 
3080     x &= 0xffffffffu;
3081     for (i = 4; i >= n; i--) {
3082         int sh = 1 << i;
3083         x = ((x << sh) | x) & even_bit_esz_masks[i];
3084     }
3085     return x;
3086 }
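
/*
 * Worked example: expand_bits(0b1011, 0) yields 0b01000101 -- each
 * source bit now sits in the even half of a 2-bit unit.  With n == 1
 * the input is treated as 2-bit units, so 0b1011 becomes 0b00100011.
 */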
3087 
3088 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3089  * For N==0, this corresponds to the operation that in qemu/bitops.h
3090  * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3091  * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3092  */
3093 static uint64_t compress_bits(uint64_t x, int n)
3094 {
3095     int i;
3096 
3097     for (i = n; i <= 4; i++) {
3098         int sh = 1 << i;
3099         x &= even_bit_esz_masks[i];
3100         x = (x >> sh) | x;
3101     }
3102     return x & 0xffffffffu;
3103 }
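
/*
 * Worked example: compress_bits(0b01000101, 0) recovers 0b1011,
 * inverting the example above; bits in the odd positions are
 * discarded by the first masking step.
 */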
3104 
3105 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3106 {
3107     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3108     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3109     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3110     int esize = 1 << esz;
3111     uint64_t *d = vd;
3112     intptr_t i;
3113 
3114     if (oprsz <= 8) {
3115         uint64_t nn = *(uint64_t *)vn;
3116         uint64_t mm = *(uint64_t *)vm;
3117         int half = 4 * oprsz;
3118 
3119         nn = extract64(nn, high * half, half);
3120         mm = extract64(mm, high * half, half);
3121         nn = expand_bits(nn, esz);
3122         mm = expand_bits(mm, esz);
3123         d[0] = nn | (mm << esize);
3124     } else {
3125         ARMPredicateReg tmp;
3126 
3127         /* We produce output faster than we consume input.
3128            Therefore we must be mindful of possible overlap.  */
3129         if (vd == vn) {
3130             vn = memcpy(&tmp, vn, oprsz);
3131             if (vd == vm) {
3132                 vm = vn;
3133             }
3134         } else if (vd == vm) {
3135             vm = memcpy(&tmp, vm, oprsz);
3136         }
3137         if (high) {
3138             high = oprsz >> 1;
3139         }
3140 
3141         if ((oprsz & 7) == 0) {
3142             uint32_t *n = vn, *m = vm;
3143             high >>= 2;
3144 
3145             for (i = 0; i < oprsz / 8; i++) {
3146                 uint64_t nn = n[H4(high + i)];
3147                 uint64_t mm = m[H4(high + i)];
3148 
3149                 nn = expand_bits(nn, esz);
3150                 mm = expand_bits(mm, esz);
3151                 d[i] = nn | (mm << esize);
3152             }
3153         } else {
3154             uint8_t *n = vn, *m = vm;
3155             uint16_t *d16 = vd;
3156 
3157             for (i = 0; i < oprsz / 2; i++) {
3158                 uint16_t nn = n[H1(high + i)];
3159                 uint16_t mm = m[H1(high + i)];
3160 
3161                 nn = expand_bits(nn, esz);
3162                 mm = expand_bits(mm, esz);
3163                 d16[H2(i)] = nn | (mm << esize);
3164             }
3165         }
3166     }
3167 }
3168 
3169 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3170 {
3171     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3172     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3173     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
3174     uint64_t *d = vd, *n = vn, *m = vm;
3175     uint64_t l, h;
3176     intptr_t i;
3177 
3178     if (oprsz <= 8) {
3179         l = compress_bits(n[0] >> odd, esz);
3180         h = compress_bits(m[0] >> odd, esz);
3181         d[0] = l | (h << (4 * oprsz));
3182     } else {
3183         ARMPredicateReg tmp_m;
3184         intptr_t oprsz_16 = oprsz / 16;
3185 
3186         if ((vm - vd) < (uintptr_t)oprsz) {
3187             m = memcpy(&tmp_m, vm, oprsz);
3188         }
3189 
3190         for (i = 0; i < oprsz_16; i++) {
3191             l = n[2 * i + 0];
3192             h = n[2 * i + 1];
3193             l = compress_bits(l >> odd, esz);
3194             h = compress_bits(h >> odd, esz);
3195             d[i] = l | (h << 32);
3196         }
3197 
3198         /*
3199          * For VL which is not a multiple of 512, the results from M do not
3200          * align nicely with the uint64_t for D.  Put the aligned results
3201          * from M into TMP_M and then copy it into place afterward.
3202          */
3203         if (oprsz & 15) {
3204             int final_shift = (oprsz & 15) * 2;
3205 
3206             l = n[2 * i + 0];
3207             h = n[2 * i + 1];
3208             l = compress_bits(l >> odd, esz);
3209             h = compress_bits(h >> odd, esz);
3210             d[i] = l | (h << final_shift);
3211 
3212             for (i = 0; i < oprsz_16; i++) {
3213                 l = m[2 * i + 0];
3214                 h = m[2 * i + 1];
3215                 l = compress_bits(l >> odd, esz);
3216                 h = compress_bits(h >> odd, esz);
3217                 tmp_m.p[i] = l | (h << 32);
3218             }
3219             l = m[2 * i + 0];
3220             h = m[2 * i + 1];
3221             l = compress_bits(l >> odd, esz);
3222             h = compress_bits(h >> odd, esz);
3223             tmp_m.p[i] = l | (h << final_shift);
3224 
3225             swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3226         } else {
3227             for (i = 0; i < oprsz_16; i++) {
3228                 l = m[2 * i + 0];
3229                 h = m[2 * i + 1];
3230                 l = compress_bits(l >> odd, esz);
3231                 h = compress_bits(h >> odd, esz);
3232                 d[oprsz_16 + i] = l | (h << 32);
3233             }
3234         }
3235     }
3236 }
3237 
3238 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3239 {
3240     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3241     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3242     int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
3243     uint64_t *d = vd, *n = vn, *m = vm;
3244     uint64_t mask;
3245     int shr, shl;
3246     intptr_t i;
3247 
3248     shl = 1 << esz;
3249     shr = 0;
3250     mask = even_bit_esz_masks[esz];
3251     if (odd) {
3252         mask <<= shl;
3253         shr = shl;
3254         shl = 0;
3255     }
3256 
3257     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3258         uint64_t nn = (n[i] & mask) >> shr;
3259         uint64_t mm = (m[i] & mask) << shl;
3260         d[i] = nn + mm;
3261     }
3262 }
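
/*
 * Illustrative example: with odd == 0, the even-indexed element bits
 * of N stay in place and the even-indexed bits of M are shifted up
 * into the adjacent odd slots (TRN1); with odd == 1, the odd-indexed
 * bits of M stay put and those of N are shifted down (TRN2).
 */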
3263 
3264 /* Reverse units of 2**N bits.  */
3265 static uint64_t reverse_bits_64(uint64_t x, int n)
3266 {
3267     int i, sh;
3268 
3269     x = bswap64(x);
3270     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3271         uint64_t mask = even_bit_esz_masks[i];
3272         x = ((x & mask) << sh) | ((x >> sh) & mask);
3273     }
3274     return x;
3275 }
3276 
3277 static uint8_t reverse_bits_8(uint8_t x, int n)
3278 {
3279     static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3280     int i, sh;
3281 
3282     for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3283         x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3284     }
3285     return x;
3286 }
3287 
3288 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3289 {
3290     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3291     int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3292     intptr_t i, oprsz_2 = oprsz / 2;
3293 
3294     if (oprsz <= 8) {
3295         uint64_t l = *(uint64_t *)vn;
3296         l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3297         *(uint64_t *)vd = l;
3298     } else if ((oprsz & 15) == 0) {
3299         for (i = 0; i < oprsz_2; i += 8) {
3300             intptr_t ih = oprsz - 8 - i;
3301             uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3302             uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3303             *(uint64_t *)(vd + i) = h;
3304             *(uint64_t *)(vd + ih) = l;
3305         }
3306     } else {
3307         for (i = 0; i < oprsz_2; i += 1) {
3308             intptr_t il = H1(i);
3309             intptr_t ih = H1(oprsz - 1 - i);
3310             uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3311             uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3312             *(uint8_t *)(vd + il) = h;
3313             *(uint8_t *)(vd + ih) = l;
3314         }
3315     }
3316 }
3317 
3318 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3319 {
3320     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3321     intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
3322     uint64_t *d = vd;
3323     intptr_t i;
3324 
3325     if (oprsz <= 8) {
3326         uint64_t nn = *(uint64_t *)vn;
3327         int half = 4 * oprsz;
3328 
3329         nn = extract64(nn, high * half, half);
3330         nn = expand_bits(nn, 0);
3331         d[0] = nn;
3332     } else {
3333         ARMPredicateReg tmp_n;
3334 
3335         /* We produce output faster than we consume input.
3336            Therefore we must be mindful of possible overlap.  */
3337         if ((vn - vd) < (uintptr_t)oprsz) {
3338             vn = memcpy(&tmp_n, vn, oprsz);
3339         }
3340         if (high) {
3341             high = oprsz >> 1;
3342         }
3343 
3344         if ((oprsz & 7) == 0) {
3345             uint32_t *n = vn;
3346             high >>= 2;
3347 
3348             for (i = 0; i < oprsz / 8; i++) {
3349                 uint64_t nn = n[H4(high + i)];
3350                 d[i] = expand_bits(nn, 0);
3351             }
3352         } else {
3353             uint16_t *d16 = vd;
3354             uint8_t *n = vn;
3355 
3356             for (i = 0; i < oprsz / 2; i++) {
3357                 uint16_t nn = n[H1(high + i)];
3358                 d16[H2(i)] = expand_bits(nn, 0);
3359             }
3360         }
3361     }
3362 }
3363 
3364 #define DO_ZIP(NAME, TYPE, H) \
3365 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
3366 {                                                                    \
3367     intptr_t oprsz = simd_oprsz(desc);                               \
3368     intptr_t odd_ofs = simd_data(desc);                              \
3369     intptr_t i, oprsz_2 = oprsz / 2;                                 \
3370     ARMVectorReg tmp_n, tmp_m;                                       \
3371     /* We produce output faster than we consume input.               \
3372        Therefore we must be mindful of possible overlap.  */         \
3373     if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
3374         vn = memcpy(&tmp_n, vn, oprsz);                              \
3375     }                                                                \
3376     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
3377         vm = memcpy(&tmp_m, vm, oprsz);                              \
3378     }                                                                \
3379     for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
3380         *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
3381         *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) =                    \
3382             *(TYPE *)(vm + odd_ofs + H(i));                          \
3383     }                                                                \
3384     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                \
3385         memset(vd + oprsz - 16, 0, 16);                              \
3386     }                                                                \
3387 }
3388 
3389 DO_ZIP(sve_zip_b, uint8_t, H1)
3390 DO_ZIP(sve_zip_h, uint16_t, H1_2)
3391 DO_ZIP(sve_zip_s, uint32_t, H1_4)
3392 DO_ZIP(sve_zip_d, uint64_t, H1_8)
3393 DO_ZIP(sve2_zip_q, Int128, )
3394 
3395 #define DO_UZP(NAME, TYPE, H) \
3396 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3397 {                                                                      \
3398     intptr_t oprsz = simd_oprsz(desc);                                 \
3399     intptr_t odd_ofs = simd_data(desc);                                \
3400     intptr_t i, p;                                                     \
3401     ARMVectorReg tmp_m;                                                \
3402     if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
3403         vm = memcpy(&tmp_m, vm, oprsz);                                \
3404     }                                                                  \
3405     i = 0, p = odd_ofs;                                                \
3406     do {                                                               \
3407         *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p));                   \
3408         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3409     } while (p < oprsz);                                               \
3410     p -= oprsz;                                                        \
3411     do {                                                               \
3412         *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p));                   \
3413         i += sizeof(TYPE), p += 2 * sizeof(TYPE);                      \
3414     } while (p < oprsz);                                               \
3415     tcg_debug_assert(i == oprsz);                                      \
3416 }
3417 
3418 DO_UZP(sve_uzp_b, uint8_t, H1)
3419 DO_UZP(sve_uzp_h, uint16_t, H1_2)
3420 DO_UZP(sve_uzp_s, uint32_t, H1_4)
3421 DO_UZP(sve_uzp_d, uint64_t, H1_8)
3422 DO_UZP(sve2_uzp_q, Int128, )
3423 
3424 #define DO_TRN(NAME, TYPE, H) \
3425 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
3426 {                                                                      \
3427     intptr_t oprsz = simd_oprsz(desc);                                 \
3428     intptr_t odd_ofs = simd_data(desc);                                \
3429     intptr_t i;                                                        \
3430     for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
3431         TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
3432         TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
3433         *(TYPE *)(vd + H(i + 0)) = ae;                                 \
3434         *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
3435     }                                                                  \
3436     if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) {                  \
3437         memset(vd + oprsz - 16, 0, 16);                                \
3438     }                                                                  \
3439 }
3440 
3441 DO_TRN(sve_trn_b, uint8_t, H1)
3442 DO_TRN(sve_trn_h, uint16_t, H1_2)
3443 DO_TRN(sve_trn_s, uint32_t, H1_4)
3444 DO_TRN(sve_trn_d, uint64_t, H1_8)
3445 DO_TRN(sve2_trn_q, Int128, )
3446 
3447 #undef DO_ZIP
3448 #undef DO_UZP
3449 #undef DO_TRN
3450 
3451 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3452 {
3453     intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3454     uint32_t *d = vd, *n = vn;
3455     uint8_t *pg = vg;
3456 
3457     for (i = j = 0; i < opr_sz; i++) {
3458         if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3459             d[H4(j)] = n[H4(i)];
3460             j++;
3461         }
3462     }
3463     for (; j < opr_sz; j++) {
3464         d[H4(j)] = 0;
3465     }
3466 }
3467 
3468 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3469 {
3470     intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3471     uint64_t *d = vd, *n = vn;
3472     uint8_t *pg = vg;
3473 
3474     for (i = j = 0; i < opr_sz; i++) {
3475         if (pg[H1(i)] & 1) {
3476             d[j] = n[i];
3477             j++;
3478         }
3479     }
3480     for (; j < opr_sz; j++) {
3481         d[j] = 0;
3482     }
3483 }
3484 
3485 /* Similar to the ARM LastActiveElement pseudocode function, except the
3486  * result is multiplied by the element size.  This includes the not found
3487  * indication; e.g. not found for esz=3 is -8.
3488  */
3489 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3490 {
3491     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3492     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3493 
3494     return last_active_element(vg, words, esz);
3495 }
3496 
3497 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3498 {
3499     intptr_t opr_sz = simd_oprsz(desc) / 8;
3500     int esz = simd_data(desc);
3501     uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3502     intptr_t i, first_i, last_i;
3503     ARMVectorReg tmp;
3504 
3505     first_i = last_i = 0;
3506     first_g = last_g = 0;
3507 
3508     /* Find the extent of the active elements within VG.  */
3509     for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3510         pg = *(uint64_t *)(vg + i) & mask;
3511         if (pg) {
3512             if (last_g == 0) {
3513                 last_g = pg;
3514                 last_i = i;
3515             }
3516             first_g = pg;
3517             first_i = i;
3518         }
3519     }
3520 
3521     len = 0;
3522     if (first_g != 0) {
3523         first_i = first_i * 8 + ctz64(first_g);
3524         last_i = last_i * 8 + 63 - clz64(last_g);
3525         len = last_i - first_i + (1 << esz);
3526         if (vd == vm) {
3527             vm = memcpy(&tmp, vm, opr_sz * 8);
3528         }
3529         swap_memmove(vd, vn + first_i, len);
3530     }
3531     swap_memmove(vd + len, vm, opr_sz * 8 - len);
3532 }
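
/*
 * Illustrative example: for esz == 0 with VG selecting bytes 2..5 of
 * N, len becomes 4, so the result is N bytes 2..5 followed by the
 * leading bytes of M -- the SPLICE concatenation.
 */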
3533 
3534 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3535                             void *vg, uint32_t desc)
3536 {
3537     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3538     uint64_t *d = vd, *n = vn, *m = vm;
3539     uint8_t *pg = vg;
3540 
3541     for (i = 0; i < opr_sz; i += 1) {
3542         uint64_t nn = n[i], mm = m[i];
3543         uint64_t pp = expand_pred_b(pg[H1(i)]);
3544         d[i] = (nn & pp) | (mm & ~pp);
3545     }
3546 }
3547 
3548 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3549                             void *vg, uint32_t desc)
3550 {
3551     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3552     uint64_t *d = vd, *n = vn, *m = vm;
3553     uint8_t *pg = vg;
3554 
3555     for (i = 0; i < opr_sz; i += 1) {
3556         uint64_t nn = n[i], mm = m[i];
3557         uint64_t pp = expand_pred_h(pg[H1(i)]);
3558         d[i] = (nn & pp) | (mm & ~pp);
3559     }
3560 }
3561 
3562 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3563                             void *vg, uint32_t desc)
3564 {
3565     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3566     uint64_t *d = vd, *n = vn, *m = vm;
3567     uint8_t *pg = vg;
3568 
3569     for (i = 0; i < opr_sz; i += 1) {
3570         uint64_t nn = n[i], mm = m[i];
3571         uint64_t pp = expand_pred_s(pg[H1(i)]);
3572         d[i] = (nn & pp) | (mm & ~pp);
3573     }
3574 }
3575 
3576 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3577                             void *vg, uint32_t desc)
3578 {
3579     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3580     uint64_t *d = vd, *n = vn, *m = vm;
3581     uint8_t *pg = vg;
3582 
3583     for (i = 0; i < opr_sz; i += 1) {
3584         uint64_t nn = n[i], mm = m[i];
3585         d[i] = (pg[H1(i)] & 1 ? nn : mm);
3586     }
3587 }
3588 
3589 void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
3590                             void *vg, uint32_t desc)
3591 {
3592     intptr_t i, opr_sz = simd_oprsz(desc) / 16;
3593     Int128 *d = vd, *n = vn, *m = vm;
3594     uint16_t *pg = vg;
3595 
3596     for (i = 0; i < opr_sz; i += 1) {
3597         d[i] = (pg[H2(i)] & 1 ? n : m)[i];
3598     }
3599 }
3600 
3601 /* Two operand comparison controlled by a predicate.
3602  * ??? It is very tempting to want to be able to expand this inline
3603  * with x86 instructions, e.g.
3604  *
3605  *    vcmpeqw    zm, zn, %ymm0
3606  *    vpmovmskb  %ymm0, %eax
3607  *    and        $0x5555, %eax
3608  *    and        pg, %eax
3609  *
3610  * or even aarch64, e.g.
3611  *
3612  *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3613  *    cmeq       v0.8h, zn, zm
3614  *    and        v0.8h, v0.8h, mask
3615  *    addv       h0, v0.8h
3616  *    and        v0.8b, pg
3617  *
3618  * However, coming up with an abstraction that allows vector inputs and
3619  * a scalar output, and also handles the byte-ordering of sub-uint64_t
3620  * scalar outputs, is tricky.
3621  */
3622 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
3623 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3624 {                                                                            \
3625     intptr_t opr_sz = simd_oprsz(desc);                                      \
3626     uint32_t flags = PREDTEST_INIT;                                          \
3627     intptr_t i = opr_sz;                                                     \
3628     do {                                                                     \
3629         uint64_t out = 0, pg;                                                \
3630         do {                                                                 \
3631             i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
3632             TYPE nn = *(TYPE *)(vn + H(i));                                  \
3633             TYPE mm = *(TYPE *)(vm + H(i));                                  \
3634             out |= nn OP mm;                                                 \
3635         } while (i & 63);                                                    \
3636         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3637         out &= pg;                                                           \
3638         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3639         flags = iter_predtest_bwd(out, pg, flags);                           \
3640     } while (i > 0);                                                         \
3641     return flags;                                                            \
3642 }
3643 
3644 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3645     DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3646 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3647     DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3648 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3649     DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3650 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3651     DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
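
/*
 * The "out <<= sizeof(TYPE)" shift places each boolean result at the
 * predicate bit for the first byte of its element, matching the MASK
 * patterns above: e.g. one bit in every four (0x1111...) for the
 * 4-byte _S forms.
 */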
3652 
3653 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
3654 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3655 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3656 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3657 
3658 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
3659 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3660 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3661 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3662 
3663 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
3664 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3665 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3666 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3667 
3668 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
3669 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3670 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3671 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3672 
3673 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
3674 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3675 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3676 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3677 
3678 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
3679 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3680 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3681 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3682 
3683 #undef DO_CMP_PPZZ_B
3684 #undef DO_CMP_PPZZ_H
3685 #undef DO_CMP_PPZZ_S
3686 #undef DO_CMP_PPZZ_D
3687 #undef DO_CMP_PPZZ
3688 
3689 /* Similar, but the second source is "wide".  */
3690 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                     \
3691 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3692 {                                                                            \
3693     intptr_t opr_sz = simd_oprsz(desc);                                      \
3694     uint32_t flags = PREDTEST_INIT;                                          \
3695     intptr_t i = opr_sz;                                                     \
3696     do {                                                                     \
3697         uint64_t out = 0, pg;                                                \
3698         do {                                                                 \
3699             TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
3700             do {                                                             \
3701                 i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
3702                 TYPE nn = *(TYPE *)(vn + H(i));                              \
3703                 out |= nn OP mm;                                             \
3704             } while (i & 7);                                                 \
3705         } while (i & 63);                                                    \
3706         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
3707         out &= pg;                                                           \
3708         *(uint64_t *)(vd + (i >> 3)) = out;                                  \
3709         flags = iter_predtest_bwd(out, pg, flags);                           \
3710     } while (i > 0);                                                         \
3711     return flags;                                                            \
3712 }
3713 
3714 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3715     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
3716 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3717     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3718 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3719     DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3720 
3721 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
3722 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3723 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
3724 
3725 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
3726 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3727 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3728 
3729 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
3730 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
3731 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)
3732 
3733 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
3734 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
3735 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)
3736 
3737 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
3738 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3739 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3740 
3741 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
3742 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3743 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3744 
3745 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
3746 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
3747 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)
3748 
3749 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
3750 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
3751 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)
3752 
3753 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
3754 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3755 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3756 
3757 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
3758 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3759 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3760 
3761 #undef DO_CMP_PPZW_B
3762 #undef DO_CMP_PPZW_H
3763 #undef DO_CMP_PPZW_S
3764 #undef DO_CMP_PPZW
3765 
3766 /* Similar, but the second source is immediate.  */
3767 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
3768 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
3769 {                                                                    \
3770     intptr_t opr_sz = simd_oprsz(desc);                              \
3771     uint32_t flags = PREDTEST_INIT;                                  \
3772     TYPE mm = simd_data(desc);                                       \
3773     intptr_t i = opr_sz;                                             \
3774     do {                                                             \
3775         uint64_t out = 0, pg;                                        \
3776         do {                                                         \
3777             i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
3778             TYPE nn = *(TYPE *)(vn + H(i));                          \
3779             out |= nn OP mm;                                         \
3780         } while (i & 63);                                            \
3781         pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
3782         out &= pg;                                                   \
3783         *(uint64_t *)(vd + (i >> 3)) = out;                          \
3784         flags = iter_predtest_bwd(out, pg, flags);                   \
3785     } while (i > 0);                                                 \
3786     return flags;                                                    \
3787 }
3788 
3789 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3790     DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
3791 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3792     DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3793 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3794     DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3795 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3796     DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
3797 
3798 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
3799 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3800 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3801 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3802 
3803 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
3804 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3805 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3806 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3807 
3808 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
3809 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3810 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3811 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3812 
3813 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
3814 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3815 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3816 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3817 
3818 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
3819 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3820 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3821 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3822 
3823 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
3824 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3825 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3826 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3827 
3828 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
3829 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3830 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3831 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3832 
3833 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
3834 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3835 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3836 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3837 
3838 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
3839 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3840 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3841 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3842 
3843 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
3844 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3845 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3846 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3847 
3848 #undef DO_CMP_PPZI_B
3849 #undef DO_CMP_PPZI_H
3850 #undef DO_CMP_PPZI_S
3851 #undef DO_CMP_PPZI_D
3852 #undef DO_CMP_PPZI
3853 
3854 /* Similar to the ARM LastActive pseudocode function.  */
3855 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3856 {
3857     intptr_t i;
3858 
3859     for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3860         uint64_t pg = *(uint64_t *)(vg + i);
3861         if (pg) {
3862             return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3863         }
3864     }
3865     return 0;
3866 }
3867 
3868 /* Compute a mask into RETB that is true for all G, up to and including
3869  * (if after) or excluding (if !after) the first G & N.
3870  * Return true if BRK found.
3871  */
3872 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3873                         bool brk, bool after)
3874 {
3875     uint64_t b;
3876 
3877     if (brk) {
3878         b = 0;
3879     } else if ((g & n) == 0) {
3880         /* For all G, no N are set; break not found.  */
3881         b = g;
3882     } else {
3883         /* Break somewhere in N.  Locate it.  */
3884         b = g & n;            /* guard true, pred true */
3885         b = b & -b;           /* first such */
3886         if (after) {
3887             b = b | (b - 1);  /* break after same */
3888         } else {
3889             b = b - 1;        /* break before same */
3890         }
3891         brk = true;
3892     }
3893 
3894     *retb = b;
3895     return brk;
3896 }
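
/*
 * Worked example: with g = 0b1111, n = 0b0100 and !brk, the first
 * active bit is b = 0b0100; AFTER gives 0b0111 (break element
 * included), while !AFTER gives 0b0011 (break element excluded).
 */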
3897 
3898 /* Compute a zeroing BRK.  */
3899 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3900                           intptr_t oprsz, bool after)
3901 {
3902     bool brk = false;
3903     intptr_t i;
3904 
3905     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3906         uint64_t this_b, this_g = g[i];
3907 
3908         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3909         d[i] = this_b & this_g;
3910     }
3911 }
3912 
3913 /* Likewise, but also compute flags.  */
3914 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3915                                intptr_t oprsz, bool after)
3916 {
3917     uint32_t flags = PREDTEST_INIT;
3918     bool brk = false;
3919     intptr_t i;
3920 
3921     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3922         uint64_t this_b, this_d, this_g = g[i];
3923 
3924         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3925         d[i] = this_d = this_b & this_g;
3926         flags = iter_predtest_fwd(this_d, this_g, flags);
3927     }
3928     return flags;
3929 }
3930 
3931 /* Compute a merging BRK.  */
3932 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3933                           intptr_t oprsz, bool after)
3934 {
3935     bool brk = false;
3936     intptr_t i;
3937 
3938     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3939         uint64_t this_b, this_g = g[i];
3940 
3941         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3942         d[i] = (this_b & this_g) | (d[i] & ~this_g);
3943     }
3944 }
3945 
3946 /* Likewise, but also compute flags.  */
3947 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3948                                intptr_t oprsz, bool after)
3949 {
3950     uint32_t flags = PREDTEST_INIT;
3951     bool brk = false;
3952     intptr_t i;
3953 
3954     for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3955         uint64_t this_b, this_d = d[i], this_g = g[i];
3956 
3957         brk = compute_brk(&this_b, n[i], this_g, brk, after);
3958         d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3959         flags = iter_predtest_fwd(this_d, this_g, flags);
3960     }
3961     return flags;
3962 }
3963 
3964 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3965 {
3966     /* It is quicker to zero the whole predicate than to loop on OPRSZ.
3967      * The compiler should turn this into 4 64-bit integer stores.
3968      */
3969     memset(d, 0, sizeof(ARMPredicateReg));
3970     return PREDTEST_INIT;
3971 }
3972 
3973 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3974                        uint32_t pred_desc)
3975 {
3976     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3977     if (last_active_pred(vn, vg, oprsz)) {
3978         compute_brk_z(vd, vm, vg, oprsz, true);
3979     } else {
3980         do_zero(vd, oprsz);
3981     }
3982 }
3983 
3984 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3985                             uint32_t pred_desc)
3986 {
3987     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3988     if (last_active_pred(vn, vg, oprsz)) {
3989         return compute_brks_z(vd, vm, vg, oprsz, true);
3990     } else {
3991         return do_zero(vd, oprsz);
3992     }
3993 }
3994 
3995 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3996                        uint32_t pred_desc)
3997 {
3998     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3999     if (last_active_pred(vn, vg, oprsz)) {
4000         compute_brk_z(vd, vm, vg, oprsz, false);
4001     } else {
4002         do_zero(vd, oprsz);
4003     }
4004 }
4005 
4006 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
4007                             uint32_t pred_desc)
4008 {
4009     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4010     if (last_active_pred(vn, vg, oprsz)) {
4011         return compute_brks_z(vd, vm, vg, oprsz, false);
4012     } else {
4013         return do_zero(vd, oprsz);
4014     }
4015 }
4016 
4017 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4018 {
4019     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4020     compute_brk_z(vd, vn, vg, oprsz, true);
4021 }
4022 
4023 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4024 {
4025     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4026     return compute_brks_z(vd, vn, vg, oprsz, true);
4027 }
4028 
4029 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4030 {
4031     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4032     compute_brk_z(vd, vn, vg, oprsz, false);
4033 }
4034 
4035 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4036 {
4037     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4038     return compute_brks_z(vd, vn, vg, oprsz, false);
4039 }
4040 
4041 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4042 {
4043     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4044     compute_brk_m(vd, vn, vg, oprsz, true);
4045 }
4046 
4047 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4048 {
4049     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4050     return compute_brks_m(vd, vn, vg, oprsz, true);
4051 }
4052 
4053 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4054 {
4055     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4056     compute_brk_m(vd, vn, vg, oprsz, false);
4057 }
4058 
4059 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4060 {
4061     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4062     return compute_brks_m(vd, vn, vg, oprsz, false);
4063 }
4064 
4065 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4066 {
4067     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4068     if (!last_active_pred(vn, vg, oprsz)) {
4069         do_zero(vd, oprsz);
4070     }
4071 }
4072 
4073 /* As if PredTest(Ones(PL), D, esz).  */
4074 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
4075                               uint64_t esz_mask)
4076 {
4077     uint32_t flags = PREDTEST_INIT;
4078     intptr_t i;
4079 
4080     for (i = 0; i < oprsz / 8; i++) {
4081         flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
4082     }
4083     if (oprsz & 7) {
4084         uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
4085         flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
4086     }
4087     return flags;
4088 }
4089 
4090 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
4091 {
4092     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4093     if (last_active_pred(vn, vg, oprsz)) {
4094         return predtest_ones(vd, oprsz, -1);
4095     } else {
4096         return do_zero(vd, oprsz);
4097     }
4098 }
4099 
4100 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
4101 {
4102     intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
4103     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4104     uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
4105     intptr_t i;
4106 
4107     for (i = 0; i < words; ++i) {
4108         uint64_t t = n[i] & g[i] & mask;
4109         sum += ctpop64(t);
4110     }
4111     return sum;
4112 }
4113 
4114 uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
4115 {
4116     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4117     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4118     uint64_t esz_mask = pred_esz_masks[esz];
4119     ARMPredicateReg *d = vd;
4120     uint32_t flags;
4121     intptr_t i;
4122 
4123     /* Begin with a zero predicate register.  */
4124     flags = do_zero(d, oprsz);
4125     if (count == 0) {
4126         return flags;
4127     }
4128 
4129     /* Set all of the requested bits.  */
4130     for (i = 0; i < count / 64; ++i) {
4131         d->p[i] = esz_mask;
4132     }
4133     if (count & 63) {
4134         d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
4135     }
4136 
4137     return predtest_ones(d, oprsz, esz_mask);
4138 }
4139 
4140 uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
4141 {
4142     intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
4143     intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
4144     uint64_t esz_mask = pred_esz_masks[esz];
4145     ARMPredicateReg *d = vd;
4146     intptr_t i, invcount, oprbits;
4147     uint64_t bits;
4148 
4149     if (count == 0) {
4150         return do_zero(d, oprsz);
4151     }
4152 
4153     oprbits = oprsz * 8;
4154     tcg_debug_assert(count <= oprbits);
4155 
4156     bits = esz_mask;
4157     if (oprbits & 63) {
4158         bits &= MAKE_64BIT_MASK(0, oprbits & 63);
4159     }
4160 
4161     invcount = oprbits - count;
4162     for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
4163         d->p[i] = bits;
4164         bits = esz_mask;
4165     }
4166 
4167     d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
4168 
4169     while (--i >= 0) {
4170         d->p[i] = 0;
4171     }
4172 
4173     return predtest_ones(d, oprsz, esz_mask);
4174 }
4175 
/* Recursive reduction of a vector through a binary function;
 * cf. the ARM ARM function ReducePredicated.
4178  *
4179  * While it would be possible to write this without the DATA temporary,
4180  * it is much simpler to process the predicate register this way.
4181  * The recursion is bounded to depth 7 (128 fp16 elements), so there's
4182  * little to gain with a more complex non-recursive form.
4183  */
4184 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
4185 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
4186 {                                                                     \
4187     if (n == 1) {                                                     \
4188         return *data;                                                 \
4189     } else {                                                          \
4190         uintptr_t half = n / 2;                                       \
4191         TYPE lo = NAME##_reduce(data, status, half);                  \
4192         TYPE hi = NAME##_reduce(data + half, status, half);           \
4193         return TYPE##_##FUNC(lo, hi, status);                         \
4194     }                                                                 \
4195 }                                                                     \
4196 uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \
4197 {                                                                     \
4198     uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
4199     TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
4200     for (i = 0; i < oprsz; ) {                                        \
4201         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
4202         do {                                                          \
4203             TYPE nn = *(TYPE *)(vn + H(i));                           \
4204             *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
4205             i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
4206         } while (i & 15);                                             \
4207     }                                                                 \
4208     for (; i < maxsz; i += sizeof(TYPE)) {                            \
4209         *(TYPE *)((void *)data + i) = IDENT;                          \
4210     }                                                                 \
4211     return NAME##_reduce(data, s, maxsz / sizeof(TYPE));              \
4212 }
4213 
4214 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
4215 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
4216 DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
4217 
4218 /* Identity is floatN_default_nan, without the function call.  */
4219 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
4220 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
4221 DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
4222 
4223 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
4224 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
4225 DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
4226 
4227 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
4228 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
4229 DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
4230 
4231 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
4232 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
4233 DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
4234 
4235 #undef DO_REDUCE
4236 
4237 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4238                              float_status *status, uint32_t desc)
4239 {
4240     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4241     float16 result = nn;
4242 
4243     do {
4244         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4245         do {
4246             if (pg & 1) {
4247                 float16 mm = *(float16 *)(vm + H1_2(i));
4248                 result = float16_add(result, mm, status);
4249             }
4250             i += sizeof(float16), pg >>= sizeof(float16);
4251         } while (i & 15);
4252     } while (i < opr_sz);
4253 
4254     return result;
4255 }
4256 
4257 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4258                              float_status *status, uint32_t desc)
4259 {
4260     intptr_t i = 0, opr_sz = simd_oprsz(desc);
4261     float32 result = nn;
4262 
4263     do {
4264         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4265         do {
4266             if (pg & 1) {
4267                 float32 mm = *(float32 *)(vm + H1_2(i));
4268                 result = float32_add(result, mm, status);
4269             }
4270             i += sizeof(float32), pg >>= sizeof(float32);
4271         } while (i & 15);
4272     } while (i < opr_sz);
4273 
4274     return result;
4275 }
4276 
4277 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4278                              float_status *status, uint32_t desc)
4279 {
4280     intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4281     uint64_t *m = vm;
4282     uint8_t *pg = vg;
4283 
4284     for (i = 0; i < opr_sz; i++) {
4285         if (pg[H1(i)] & 1) {
4286             nn = float64_add(nn, m[i], status);
4287         }
4288     }
4289 
4290     return nn;
4291 }
4292 
/* Fully general three-operand expander, controlled by a predicate,
 * with the extra float_status parameter.
 */
4296 #define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
4297 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
4298                   float_status *status, uint32_t desc)          \
4299 {                                                               \
4300     intptr_t i = simd_oprsz(desc);                              \
4301     uint64_t *g = vg;                                           \
4302     do {                                                        \
4303         uint64_t pg = g[(i - 1) >> 6];                          \
4304         do {                                                    \
4305             i -= sizeof(TYPE);                                  \
4306             if (likely((pg >> (i & 63)) & 1)) {                 \
4307                 TYPE nn = *(TYPE *)(vn + H(i));                 \
4308                 TYPE mm = *(TYPE *)(vm + H(i));                 \
4309                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
4310             }                                                   \
4311         } while (i & 63);                                       \
4312     } while (i != 0);                                           \
4313 }
4314 
4315 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4316 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4317 DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
4318 
4319 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4320 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4321 DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
4322 
4323 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4324 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4325 DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
4326 
4327 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4328 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4329 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
4330 
4331 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4332 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4333 DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
4334 
4335 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4336 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4337 DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
4338 
4339 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4340 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4341 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
4342 
4343 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4344 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4345 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
4346 
4347 static inline float16 abd_h(float16 a, float16 b, float_status *s)
4348 {
4349     return float16_abs(float16_sub(a, b, s));
4350 }
4351 
4352 static inline float32 abd_s(float32 a, float32 b, float_status *s)
4353 {
4354     return float32_abs(float32_sub(a, b, s));
4355 }
4356 
4357 static inline float64 abd_d(float64 a, float64 b, float_status *s)
4358 {
4359     return float64_abs(float64_sub(a, b, s));
4360 }
4361 
4362 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4363 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4364 DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
4365 
4366 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4367 {
4368     int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4369     return float64_scalbn(a, b_int, s);
4370 }
4371 
4372 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4373 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4374 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
4375 
4376 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4377 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4378 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
4379 
4380 #undef DO_ZPZZ_FP
4381 
4382 /* Three-operand expander, with one scalar operand, controlled by
4383  * a predicate, with the extra float_status parameter.
4384  */
4385 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4386 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
4387                   float_status *status, uint32_t desc)            \
4388 {                                                                 \
4389     intptr_t i = simd_oprsz(desc);                                \
4390     uint64_t *g = vg;                                             \
4391     TYPE mm = scalar;                                             \
4392     do {                                                          \
4393         uint64_t pg = g[(i - 1) >> 6];                            \
4394         do {                                                      \
4395             i -= sizeof(TYPE);                                    \
4396             if (likely((pg >> (i & 63)) & 1)) {                   \
4397                 TYPE nn = *(TYPE *)(vn + H(i));                   \
4398                 *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
4399             }                                                     \
4400         } while (i & 63);                                         \
4401     } while (i != 0);                                             \
4402 }
4403 
4404 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4405 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4406 DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
4407 
4408 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4409 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4410 DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
4411 
4412 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4413 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4414 DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
4415 
4416 static inline float16 subr_h(float16 a, float16 b, float_status *s)
4417 {
4418     return float16_sub(b, a, s);
4419 }
4420 
4421 static inline float32 subr_s(float32 a, float32 b, float_status *s)
4422 {
4423     return float32_sub(b, a, s);
4424 }
4425 
4426 static inline float64 subr_d(float64 a, float64 b, float_status *s)
4427 {
4428     return float64_sub(b, a, s);
4429 }
4430 
4431 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4432 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4433 DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
4434 
4435 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4436 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4437 DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
4438 
4439 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4440 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4441 DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
4442 
4443 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4444 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4445 DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
4446 
4447 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4448 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4449 DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
4450 
/* Fully general two-operand expander, controlled by a predicate,
 * with the extra float_status parameter.
 */
4454 #define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
4455 void HELPER(NAME)(void *vd, void *vn, void *vg,                       \
4456                   float_status *status, uint32_t desc)                \
4457 {                                                                     \
4458     intptr_t i = simd_oprsz(desc);                                    \
4459     uint64_t *g = vg;                                                 \
4460     do {                                                              \
4461         uint64_t pg = g[(i - 1) >> 6];                                \
4462         do {                                                          \
4463             i -= sizeof(TYPE);                                        \
4464             if (likely((pg >> (i & 63)) & 1)) {                       \
4465                 TYPE nn = *(TYPE *)(vn + H(i));                       \
4466                 *(TYPE *)(vd + H(i)) = OP(nn, status);                \
4467             }                                                         \
4468         } while (i & 63);                                             \
4469     } while (i != 0);                                                 \
4470 }
4471 
4472 /* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
4473  * FZ16.  When converting from fp16, this affects flushing input denormals;
4474  * when converting to fp16, this affects flushing output denormals.
4475  */
4476 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4477 {
4478     bool save = get_flush_inputs_to_zero(fpst);
4479     float32 ret;
4480 
4481     set_flush_inputs_to_zero(false, fpst);
4482     ret = float16_to_float32(f, true, fpst);
4483     set_flush_inputs_to_zero(save, fpst);
4484     return ret;
4485 }
4486 
4487 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4488 {
4489     bool save = get_flush_inputs_to_zero(fpst);
4490     float64 ret;
4491 
4492     set_flush_inputs_to_zero(false, fpst);
4493     ret = float16_to_float64(f, true, fpst);
4494     set_flush_inputs_to_zero(save, fpst);
4495     return ret;
4496 }
4497 
4498 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4499 {
4500     bool save = get_flush_to_zero(fpst);
4501     float16 ret;
4502 
4503     set_flush_to_zero(false, fpst);
4504     ret = float32_to_float16(f, true, fpst);
4505     set_flush_to_zero(save, fpst);
4506     return ret;
4507 }
4508 
4509 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4510 {
4511     bool save = get_flush_to_zero(fpst);
4512     float16 ret;
4513 
4514     set_flush_to_zero(false, fpst);
4515     ret = float64_to_float16(f, true, fpst);
4516     set_flush_to_zero(save, fpst);
4517     return ret;
4518 }
4519 
4520 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4521 {
4522     if (float16_is_any_nan(f)) {
4523         float_raise(float_flag_invalid, s);
4524         return 0;
4525     }
4526     return float16_to_int16_round_to_zero(f, s);
4527 }
4528 
4529 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4530 {
4531     if (float16_is_any_nan(f)) {
4532         float_raise(float_flag_invalid, s);
4533         return 0;
4534     }
4535     return float16_to_int64_round_to_zero(f, s);
4536 }
4537 
4538 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4539 {
4540     if (float32_is_any_nan(f)) {
4541         float_raise(float_flag_invalid, s);
4542         return 0;
4543     }
4544     return float32_to_int64_round_to_zero(f, s);
4545 }
4546 
4547 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4548 {
4549     if (float64_is_any_nan(f)) {
4550         float_raise(float_flag_invalid, s);
4551         return 0;
4552     }
4553     return float64_to_int64_round_to_zero(f, s);
4554 }
4555 
4556 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4557 {
4558     if (float16_is_any_nan(f)) {
4559         float_raise(float_flag_invalid, s);
4560         return 0;
4561     }
4562     return float16_to_uint16_round_to_zero(f, s);
4563 }
4564 
4565 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4566 {
4567     if (float16_is_any_nan(f)) {
4568         float_raise(float_flag_invalid, s);
4569         return 0;
4570     }
4571     return float16_to_uint64_round_to_zero(f, s);
4572 }
4573 
4574 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4575 {
4576     if (float32_is_any_nan(f)) {
4577         float_raise(float_flag_invalid, s);
4578         return 0;
4579     }
4580     return float32_to_uint64_round_to_zero(f, s);
4581 }
4582 
4583 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4584 {
4585     if (float64_is_any_nan(f)) {
4586         float_raise(float_flag_invalid, s);
4587         return 0;
4588     }
4589     return float64_to_uint64_round_to_zero(f, s);
4590 }
4591 
4592 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4593 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4594 DO_ZPZ_FP(sve_bfcvt,   uint32_t, H1_4, float32_to_bfloat16)
4595 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
4596 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
4597 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
4598 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
4599 
4600 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4601 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4602 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4603 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
4604 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
4605 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
4606 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
4607 
4608 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4609 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4610 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4611 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
4612 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
4613 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
4614 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
4615 
4616 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4617 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4618 DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
4619 
4620 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4621 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4622 DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
4623 
4624 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4625 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4626 DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
4627 
4628 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4629 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4630 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
4631 
4632 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4633 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4634 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4635 DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
4636 DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
4637 DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
4638 DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
4639 
4640 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4641 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4642 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4643 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
4644 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
4645 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
4646 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
4647 
4648 static int16_t do_float16_logb_as_int(float16 a, float_status *s)
4649 {
4650     /* Extract frac to the top of the uint32_t. */
4651     uint32_t frac = (uint32_t)a << (16 + 6);
4652     int16_t exp = extract32(a, 10, 5);
4653 
4654     if (unlikely(exp == 0)) {
4655         if (frac != 0) {
4656             if (!get_flush_inputs_to_zero(s)) {
4657                 /* denormal: bias - fractional_zeros */
4658                 return -15 - clz32(frac);
4659             }
4660             /* flush to zero */
4661             float_raise(float_flag_input_denormal, s);
4662         }
4663     } else if (unlikely(exp == 0x1f)) {
4664         if (frac == 0) {
4665             return INT16_MAX; /* infinity */
4666         }
4667     } else {
4668         /* normal: exp - bias */
4669         return exp - 15;
4670     }
4671     /* nan or zero */
4672     float_raise(float_flag_invalid, s);
4673     return INT16_MIN;
4674 }
4675 
4676 static int32_t do_float32_logb_as_int(float32 a, float_status *s)
4677 {
4678     /* Extract frac to the top of the uint32_t. */
4679     uint32_t frac = a << 9;
4680     int32_t exp = extract32(a, 23, 8);
4681 
4682     if (unlikely(exp == 0)) {
4683         if (frac != 0) {
4684             if (!get_flush_inputs_to_zero(s)) {
4685                 /* denormal: bias - fractional_zeros */
4686                 return -127 - clz32(frac);
4687             }
4688             /* flush to zero */
4689             float_raise(float_flag_input_denormal, s);
4690         }
4691     } else if (unlikely(exp == 0xff)) {
4692         if (frac == 0) {
4693             return INT32_MAX; /* infinity */
4694         }
4695     } else {
4696         /* normal: exp - bias */
4697         return exp - 127;
4698     }
4699     /* nan or zero */
4700     float_raise(float_flag_invalid, s);
4701     return INT32_MIN;
4702 }
4703 
4704 static int64_t do_float64_logb_as_int(float64 a, float_status *s)
4705 {
4706     /* Extract frac to the top of the uint64_t. */
4707     uint64_t frac = a << 12;
4708     int64_t exp = extract64(a, 52, 11);
4709 
4710     if (unlikely(exp == 0)) {
4711         if (frac != 0) {
4712             if (!get_flush_inputs_to_zero(s)) {
4713                 /* denormal: bias - fractional_zeros */
4714                 return -1023 - clz64(frac);
4715             }
4716             /* flush to zero */
4717             float_raise(float_flag_input_denormal, s);
4718         }
4719     } else if (unlikely(exp == 0x7ff)) {
4720         if (frac == 0) {
4721             return INT64_MAX; /* infinity */
4722         }
4723     } else {
4724         /* normal: exp - bias */
4725         return exp - 1023;
4726     }
4727     /* nan or zero */
4728     float_raise(float_flag_invalid, s);
4729     return INT64_MIN;
4730 }
4731 
4732 DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
4733 DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
4734 DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
4735 
4736 #undef DO_ZPZ_FP
4737 
4738 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4739                             float_status *status, uint32_t desc,
4740                             uint16_t neg1, uint16_t neg3)
4741 {
4742     intptr_t i = simd_oprsz(desc);
4743     uint64_t *g = vg;
4744 
4745     do {
4746         uint64_t pg = g[(i - 1) >> 6];
4747         do {
4748             i -= 2;
4749             if (likely((pg >> (i & 63)) & 1)) {
4750                 float16 e1, e2, e3, r;
4751 
4752                 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4753                 e2 = *(uint16_t *)(vm + H1_2(i));
4754                 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
4755                 r = float16_muladd(e1, e2, e3, 0, status);
4756                 *(uint16_t *)(vd + H1_2(i)) = r;
4757             }
4758         } while (i & 63);
4759     } while (i != 0);
4760 }
4761 
4762 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4763                               void *vg, float_status *status, uint32_t desc)
4764 {
4765     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
4766 }
4767 
4768 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4769                               void *vg, float_status *status, uint32_t desc)
4770 {
4771     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
4772 }
4773 
4774 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4775                                void *vg, float_status *status, uint32_t desc)
4776 {
4777     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
4778 }
4779 
4780 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4781                                void *vg, float_status *status, uint32_t desc)
4782 {
4783     do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
4784 }
4785 
4786 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4787                             float_status *status, uint32_t desc,
4788                             uint32_t neg1, uint32_t neg3)
4789 {
4790     intptr_t i = simd_oprsz(desc);
4791     uint64_t *g = vg;
4792 
4793     do {
4794         uint64_t pg = g[(i - 1) >> 6];
4795         do {
4796             i -= 4;
4797             if (likely((pg >> (i & 63)) & 1)) {
4798                 float32 e1, e2, e3, r;
4799 
4800                 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4801                 e2 = *(uint32_t *)(vm + H1_4(i));
4802                 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
4803                 r = float32_muladd(e1, e2, e3, 0, status);
4804                 *(uint32_t *)(vd + H1_4(i)) = r;
4805             }
4806         } while (i & 63);
4807     } while (i != 0);
4808 }
4809 
4810 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4811                               void *vg, float_status *status, uint32_t desc)
4812 {
4813     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
4814 }
4815 
4816 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4817                               void *vg, float_status *status, uint32_t desc)
4818 {
4819     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
4820 }
4821 
4822 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4823                                void *vg, float_status *status, uint32_t desc)
4824 {
4825     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
4826 }
4827 
4828 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4829                                void *vg, float_status *status, uint32_t desc)
4830 {
4831     do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
4832 }
4833 
4834 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4835                             float_status *status, uint32_t desc,
4836                             uint64_t neg1, uint64_t neg3)
4837 {
4838     intptr_t i = simd_oprsz(desc);
4839     uint64_t *g = vg;
4840 
4841     do {
4842         uint64_t pg = g[(i - 1) >> 6];
4843         do {
4844             i -= 8;
4845             if (likely((pg >> (i & 63)) & 1)) {
4846                 float64 e1, e2, e3, r;
4847 
4848                 e1 = *(uint64_t *)(vn + i) ^ neg1;
4849                 e2 = *(uint64_t *)(vm + i);
4850                 e3 = *(uint64_t *)(va + i) ^ neg3;
4851                 r = float64_muladd(e1, e2, e3, 0, status);
4852                 *(uint64_t *)(vd + i) = r;
4853             }
4854         } while (i & 63);
4855     } while (i != 0);
4856 }
4857 
4858 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4859                               void *vg, float_status *status, uint32_t desc)
4860 {
4861     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
4862 }
4863 
4864 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4865                               void *vg, float_status *status, uint32_t desc)
4866 {
4867     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
4868 }
4869 
4870 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4871                                void *vg, float_status *status, uint32_t desc)
4872 {
4873     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
4874 }
4875 
4876 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4877                                void *vg, float_status *status, uint32_t desc)
4878 {
4879     do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
4880 }
4881 
/* Two-operand floating-point comparison controlled by a predicate.
 * Unlike the integer version, we are not allowed to optimistically
 * compare operands, since the comparison may have side effects on
 * the FPSR.
 */
4887 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
4888 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
4889                   float_status *status, uint32_t desc)                  \
4890 {                                                                       \
4891     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
4892     uint64_t *d = vd, *g = vg;                                          \
4893     do {                                                                \
4894         uint64_t out = 0, pg = g[j];                                    \
4895         do {                                                            \
4896             i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
4897             if (likely((pg >> (i & 63)) & 1)) {                         \
4898                 TYPE nn = *(TYPE *)(vn + H(i));                         \
4899                 TYPE mm = *(TYPE *)(vm + H(i));                         \
4900                 out |= OP(TYPE, nn, mm, status);                        \
4901             }                                                           \
4902         } while (i & 63);                                               \
4903         d[j--] = out;                                                   \
4904     } while (i > 0);                                                    \
4905 }
4906 
4907 #define DO_FPCMP_PPZZ_H(NAME, OP) \
4908     DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4909 #define DO_FPCMP_PPZZ_S(NAME, OP) \
4910     DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4911 #define DO_FPCMP_PPZZ_D(NAME, OP) \
4912     DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
4913 
4914 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4915     DO_FPCMP_PPZZ_H(NAME, OP)   \
4916     DO_FPCMP_PPZZ_S(NAME, OP)   \
4917     DO_FPCMP_PPZZ_D(NAME, OP)
4918 
4919 #define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
4920 #define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
4921 #define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
4922 #define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
4923 #define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
4924 #define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
4925 #define DO_FCMUO(TYPE, X, Y, ST)  \
4926     TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4927 #define DO_FACGE(TYPE, X, Y, ST)  \
4928     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4929 #define DO_FACGT(TYPE, X, Y, ST)  \
4930     TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
4931 
4932 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4933 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4934 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4935 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4936 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4937 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4938 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4939 
4940 #undef DO_FPCMP_PPZZ_ALL
4941 #undef DO_FPCMP_PPZZ_D
4942 #undef DO_FPCMP_PPZZ_S
4943 #undef DO_FPCMP_PPZZ_H
4944 #undef DO_FPCMP_PPZZ
4945 
/* One-operand floating-point comparison against zero, controlled
4947  * by a predicate.
4948  */
4949 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
4950 void HELPER(NAME)(void *vd, void *vn, void *vg,            \
4951                   float_status *status, uint32_t desc)     \
4952 {                                                          \
4953     intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
4954     uint64_t *d = vd, *g = vg;                             \
4955     do {                                                   \
4956         uint64_t out = 0, pg = g[j];                       \
4957         do {                                               \
4958             i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
4959             if ((pg >> (i & 63)) & 1) {                    \
4960                 TYPE nn = *(TYPE *)(vn + H(i));            \
4961                 out |= OP(TYPE, nn, 0, status);            \
4962             }                                              \
4963         } while (i & 63);                                  \
4964         d[j--] = out;                                      \
4965     } while (i > 0);                                       \
4966 }
4967 
4968 #define DO_FPCMP_PPZ0_H(NAME, OP) \
4969     DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4970 #define DO_FPCMP_PPZ0_S(NAME, OP) \
4971     DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4972 #define DO_FPCMP_PPZ0_D(NAME, OP) \
4973     DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
4974 
4975 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4976     DO_FPCMP_PPZ0_H(NAME, OP)   \
4977     DO_FPCMP_PPZ0_S(NAME, OP)   \
4978     DO_FPCMP_PPZ0_D(NAME, OP)
4979 
4980 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4981 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4982 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4983 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4984 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4985 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4986 
4987 /* FP Trig Multiply-Add. */
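
/*
 * The coefficient tables below appear to hold the architected
 * polynomial coefficients: entries [0..7] are the sine series
 * (1, -1/6, 1/120, ...) and entries [8..15] the cosine series
 * (1, -1/2, 1/24, ...), which is why a negative M operand adds 8
 * to the table index.  (This reading of the constants is for
 * illustration; the values themselves are architectural.)
 */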
4988 
4989 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm,
4990                          float_status *s, uint32_t desc)
4991 {
4992     static const float16 coeff[16] = {
4993         0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4994         0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4995     };
4996     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4997     intptr_t x = simd_data(desc);
4998     float16 *d = vd, *n = vn, *m = vm;
4999     for (i = 0; i < opr_sz; i++) {
5000         float16 mm = m[i];
5001         intptr_t xx = x;
5002         if (float16_is_neg(mm)) {
5003             mm = float16_abs(mm);
5004             xx += 8;
5005         }
5006         d[i] = float16_muladd(n[i], mm, coeff[xx], 0, s);
5007     }
5008 }
5009 
5010 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm,
5011                          float_status *s, uint32_t desc)
5012 {
5013     static const float32 coeff[16] = {
5014         0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5015         0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5016         0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5017         0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5018     };
5019     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
5020     intptr_t x = simd_data(desc);
5021     float32 *d = vd, *n = vn, *m = vm;
5022     for (i = 0; i < opr_sz; i++) {
5023         float32 mm = m[i];
5024         intptr_t xx = x;
5025         if (float32_is_neg(mm)) {
5026             mm = float32_abs(mm);
5027             xx += 8;
5028         }
5029         d[i] = float32_muladd(n[i], mm, coeff[xx], 0, s);
5030     }
5031 }
5032 
5033 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm,
5034                          float_status *s, uint32_t desc)
5035 {
5036     static const float64 coeff[16] = {
5037         0x3ff0000000000000ull, 0xbfc5555555555543ull,
5038         0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
5039         0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
5040         0x3de5d8408868552full, 0x0000000000000000ull,
5041         0x3ff0000000000000ull, 0xbfe0000000000000ull,
5042         0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
5043         0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
5044         0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
5045     };
5046     intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
5047     intptr_t x = simd_data(desc);
5048     float64 *d = vd, *n = vn, *m = vm;
5049     for (i = 0; i < opr_sz; i++) {
5050         float64 mm = m[i];
5051         intptr_t xx = x;
5052         if (float64_is_neg(mm)) {
5053             mm = float64_abs(mm);
5054             xx += 8;
5055         }
5056         d[i] = float64_muladd(n[i], mm, coeff[xx], 0, s);
5057     }
5058 }
5059 
5060 /*
5061  * FP Complex Add
5062  */
5063 
5064 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
5065                          float_status *s, uint32_t desc)
5066 {
5067     intptr_t j, i = simd_oprsz(desc);
5068     uint64_t *g = vg;
5069     float16 neg_imag = float16_set_sign(0, simd_data(desc));
5070     float16 neg_real = float16_chs(neg_imag);
5071 
5072     do {
5073         uint64_t pg = g[(i - 1) >> 6];
5074         do {
5075             float16 e0, e1, e2, e3;
5076 
5077             /* I holds the real index; J holds the imag index.  */
5078             j = i - sizeof(float16);
5079             i -= 2 * sizeof(float16);
5080 
5081             e0 = *(float16 *)(vn + H1_2(i));
5082             e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
5083             e2 = *(float16 *)(vn + H1_2(j));
5084             e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
5085 
5086             if (likely((pg >> (i & 63)) & 1)) {
5087                 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s);
5088             }
5089             if (likely((pg >> (j & 63)) & 1)) {
5090                 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, s);
5091             }
5092         } while (i & 63);
5093     } while (i != 0);
5094 }
5095 
5096 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
5097                          float_status *s, uint32_t desc)
5098 {
5099     intptr_t j, i = simd_oprsz(desc);
5100     uint64_t *g = vg;
5101     float32 neg_imag = float32_set_sign(0, simd_data(desc));
5102     float32 neg_real = float32_chs(neg_imag);
5103 
5104     do {
5105         uint64_t pg = g[(i - 1) >> 6];
5106         do {
5107             float32 e0, e1, e2, e3;
5108 
5109             /* I holds the real index; J holds the imag index.  */
5110             j = i - sizeof(float32);
5111             i -= 2 * sizeof(float32);
5112 
5113             e0 = *(float32 *)(vn + H1_2(i));
5114             e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
5115             e2 = *(float32 *)(vn + H1_2(j));
5116             e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
5117 
5118             if (likely((pg >> (i & 63)) & 1)) {
5119                 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s);
5120             }
5121             if (likely((pg >> (j & 63)) & 1)) {
5122                 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, s);
5123             }
5124         } while (i & 63);
5125     } while (i != 0);
5126 }
5127 
5128 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
5129                          float_status *s, uint32_t desc)
5130 {
5131     intptr_t j, i = simd_oprsz(desc);
5132     uint64_t *g = vg;
5133     float64 neg_imag = float64_set_sign(0, simd_data(desc));
5134     float64 neg_real = float64_chs(neg_imag);
5135 
5136     do {
5137         uint64_t pg = g[(i - 1) >> 6];
5138         do {
5139             float64 e0, e1, e2, e3;
5140 
5141             /* I holds the real index; J holds the imag index.  */
5142             j = i - sizeof(float64);
5143             i -= 2 * sizeof(float64);
5144 
5145             e0 = *(float64 *)(vn + H1_2(i));
5146             e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
5147             e2 = *(float64 *)(vn + H1_2(j));
5148             e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
5149 
5150             if (likely((pg >> (i & 63)) & 1)) {
5151                 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s);
5152             }
5153             if (likely((pg >> (j & 63)) & 1)) {
5154                 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, s);
5155             }
5156         } while (i & 63);
5157     } while (i != 0);
5158 }
5159 
5160 /*
5161  * FP Complex Multiply
5162  */
5163 
5164 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
5165                                void *vg, float_status *status, uint32_t desc)
5166 {
5167     intptr_t j, i = simd_oprsz(desc);
5168     unsigned rot = simd_data(desc);
5169     bool flip = rot & 1;
5170     float16 neg_imag, neg_real;
5171     uint64_t *g = vg;
5172 
5173     neg_imag = float16_set_sign(0, (rot & 2) != 0);
5174     neg_real = float16_set_sign(0, rot == 1 || rot == 2);
5175 
5176     do {
5177         uint64_t pg = g[(i - 1) >> 6];
5178         do {
5179             float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
5180 
5181             /* I holds the real index; J holds the imag index.  */
5182             j = i - sizeof(float16);
5183             i -= 2 * sizeof(float16);
5184 
5185             nr = *(float16 *)(vn + H1_2(i));
5186             ni = *(float16 *)(vn + H1_2(j));
5187             mr = *(float16 *)(vm + H1_2(i));
5188             mi = *(float16 *)(vm + H1_2(j));
5189 
5190             e2 = (flip ? ni : nr);
5191             e1 = (flip ? mi : mr) ^ neg_real;
5192             e4 = e2;
5193             e3 = (flip ? mr : mi) ^ neg_imag;
5194 
5195             if (likely((pg >> (i & 63)) & 1)) {
5196                 d = *(float16 *)(va + H1_2(i));
5197                 d = float16_muladd(e2, e1, d, 0, status);
5198                 *(float16 *)(vd + H1_2(i)) = d;
5199             }
5200             if (likely((pg >> (j & 63)) & 1)) {
5201                 d = *(float16 *)(va + H1_2(j));
5202                 d = float16_muladd(e4, e3, d, 0, status);
5203                 *(float16 *)(vd + H1_2(j)) = d;
5204             }
5205         } while (i & 63);
5206     } while (i != 0);
5207 }
5208 
5209 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
5210                                void *vg, float_status *status, uint32_t desc)
5211 {
5212     intptr_t j, i = simd_oprsz(desc);
5213     unsigned rot = simd_data(desc);
5214     bool flip = rot & 1;
5215     float32 neg_imag, neg_real;
5216     uint64_t *g = vg;
5217 
5218     neg_imag = float32_set_sign(0, (rot & 2) != 0);
5219     neg_real = float32_set_sign(0, rot == 1 || rot == 2);
5220 
5221     do {
5222         uint64_t pg = g[(i - 1) >> 6];
5223         do {
5224             float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
5225 
5226             /* I holds the real index; J holds the imag index.  */
5227             j = i - sizeof(float32);
5228             i -= 2 * sizeof(float32);
5229 
5230             nr = *(float32 *)(vn + H1_2(i));
5231             ni = *(float32 *)(vn + H1_2(j));
5232             mr = *(float32 *)(vm + H1_2(i));
5233             mi = *(float32 *)(vm + H1_2(j));
5234 
5235             e2 = (flip ? ni : nr);
5236             e1 = (flip ? mi : mr) ^ neg_real;
5237             e4 = e2;
5238             e3 = (flip ? mr : mi) ^ neg_imag;
5239 
5240             if (likely((pg >> (i & 63)) & 1)) {
5241                 d = *(float32 *)(va + H1_2(i));
5242                 d = float32_muladd(e2, e1, d, 0, status);
5243                 *(float32 *)(vd + H1_2(i)) = d;
5244             }
5245             if (likely((pg >> (j & 63)) & 1)) {
5246                 d = *(float32 *)(va + H1_2(j));
5247                 d = float32_muladd(e4, e3, d, 0, status);
5248                 *(float32 *)(vd + H1_2(j)) = d;
5249             }
5250         } while (i & 63);
5251     } while (i != 0);
5252 }
5253 
5254 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
5255                                void *vg, float_status *status, uint32_t desc)
5256 {
5257     intptr_t j, i = simd_oprsz(desc);
5258     unsigned rot = simd_data(desc);
5259     bool flip = rot & 1;
5260     float64 neg_imag, neg_real;
5261     uint64_t *g = vg;
5262 
5263     neg_imag = float64_set_sign(0, (rot & 2) != 0);
5264     neg_real = float64_set_sign(0, rot == 1 || rot == 2);
5265 
5266     do {
5267         uint64_t pg = g[(i - 1) >> 6];
5268         do {
5269             float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
5270 
5271             /* I holds the real index; J holds the imag index.  */
5272             j = i - sizeof(float64);
5273             i -= 2 * sizeof(float64);
5274 
5275             nr = *(float64 *)(vn + H1_2(i));
5276             ni = *(float64 *)(vn + H1_2(j));
5277             mr = *(float64 *)(vm + H1_2(i));
5278             mi = *(float64 *)(vm + H1_2(j));
5279 
5280             e2 = (flip ? ni : nr);
5281             e1 = (flip ? mi : mr) ^ neg_real;
5282             e4 = e2;
5283             e3 = (flip ? mr : mi) ^ neg_imag;
5284 
5285             if (likely((pg >> (i & 63)) & 1)) {
5286                 d = *(float64 *)(va + H1_2(i));
5287                 d = float64_muladd(e2, e1, d, 0, status);
5288                 *(float64 *)(vd + H1_2(i)) = d;
5289             }
5290             if (likely((pg >> (j & 63)) & 1)) {
5291                 d = *(float64 *)(va + H1_2(j));
5292                 d = float64_muladd(e4, e3, d, 0, status);
5293                 *(float64 *)(vd + H1_2(j)) = d;
5294             }
5295         } while (i & 63);
5296     } while (i != 0);
5297 }
5298 
5299 /*
5300  * Load contiguous data, protected by a governing predicate.
5301  */
5302 
5303 /*
5304  * Skip through a sequence of inactive elements in the guarding predicate @vg,
 * beginning at @reg_off bounded by @reg_max.  Return the offset of the first
 * active element >= @reg_off, or @reg_max if there are no active elements.
5307  */
5308 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5309                                  intptr_t reg_max, int esz)
5310 {
5311     uint64_t pg_mask = pred_esz_masks[esz];
5312     uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5313 
5314     /* In normal usage, the first element is active.  */
5315     if (likely(pg & 1)) {
5316         return reg_off;
5317     }
5318 
5319     if (pg == 0) {
5320         reg_off &= -64;
5321         do {
5322             reg_off += 64;
5323             if (unlikely(reg_off >= reg_max)) {
5324                 /* The entire predicate was false.  */
5325                 return reg_max;
5326             }
5327             pg = vg[reg_off >> 6] & pg_mask;
5328         } while (pg == 0);
5329     }
5330     reg_off += ctz64(pg);
5331 
    /* We should never see an out-of-range predicate bit set.  */
5333     tcg_debug_assert(reg_off < reg_max);
5334     return reg_off;
5335 }
5336 
5337 /*
5338  * Resolve the guest virtual address to info->host and info->flags.
 * If @nofault, return false if the page is invalid; otherwise
 * exit via a page fault exception.
 */
5343 bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
5344                     target_ulong addr, int mem_off, MMUAccessType access_type,
5345                     int mmu_idx, uintptr_t retaddr)
5346 {
5347     int flags;
5348 
5349     addr += mem_off;
5350 
5351     /*
5352      * User-only currently always issues with TBI.  See the comment
5353      * above useronly_clean_ptr.  Usually we clean this top byte away
5354      * during translation, but we can't do that for e.g. vector + imm
5355      * addressing modes.
5356      *
5357      * We currently always enable TBI for user-only, and do not provide
5358      * a way to turn it off.  So clean the pointer unconditionally here,
5359      * rather than look it up here, or pass it down from above.
5360      */
5361     addr = useronly_clean_ptr(addr);
5362 
5363 #ifdef CONFIG_USER_ONLY
5364     flags = probe_access_flags(env, addr, 0, access_type, mmu_idx, nofault,
5365                                &info->host, retaddr);
5366 #else
5367     CPUTLBEntryFull *full;
5368     flags = probe_access_full(env, addr, 0, access_type, mmu_idx, nofault,
5369                               &info->host, &full, retaddr);
5370 #endif
5371     info->flags = flags;
5372 
5373     if (flags & TLB_INVALID_MASK) {
5374         g_assert(nofault);
5375         return false;
5376     }
5377 
5378 #ifdef CONFIG_USER_ONLY
5379     memset(&info->attrs, 0, sizeof(info->attrs));
5380     /* Require both ANON and MTE; see allocation_tag_mem(). */
5381     info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
5382 #else
5383     info->attrs = full->attrs;
5384     info->tagged = full->extra.arm.pte_attrs == 0xf0;
5385 #endif
5386 
5387     /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5388     info->host -= mem_off;
5389     return true;
5390 }
5391 
5392 /*
 * Find the first active element on each page, and a loose bound for the
5394  * final element on each page.  Identify any single element that spans
5395  * the page boundary.  Return true if there are any active elements.
5396  */
5397 bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
5398                             intptr_t reg_max, int esz, int msize)
5399 {
5400     const int esize = 1 << esz;
5401     const uint64_t pg_mask = pred_esz_masks[esz];
5402     intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5403     intptr_t mem_off_last, mem_off_split;
5404     intptr_t page_split, elt_split;
5405     intptr_t i;
5406 
5407     /* Set all of the element indices to -1, and the TLB data to 0. */
5408     memset(info, -1, offsetof(SVEContLdSt, page));
5409     memset(info->page, 0, sizeof(info->page));
5410 
5411     /* Gross scan over the entire predicate to find bounds. */
5412     i = 0;
5413     do {
5414         uint64_t pg = vg[i] & pg_mask;
5415         if (pg) {
5416             reg_off_last = i * 64 + 63 - clz64(pg);
5417             if (reg_off_first < 0) {
5418                 reg_off_first = i * 64 + ctz64(pg);
5419             }
5420         }
5421     } while (++i * 64 < reg_max);
5422 
5423     if (unlikely(reg_off_first < 0)) {
5424         /* No active elements, no pages touched. */
5425         return false;
5426     }
5427     tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5428 
5429     info->reg_off_first[0] = reg_off_first;
5430     info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5431     mem_off_last = (reg_off_last >> esz) * msize;
5432 
5433     page_split = -(addr | TARGET_PAGE_MASK);
5434     if (likely(mem_off_last + msize <= page_split)) {
5435         /* The entire operation fits within a single page. */
5436         info->reg_off_last[0] = reg_off_last;
5437         return true;
5438     }
5439 
5440     info->page_split = page_split;
5441     elt_split = page_split / msize;
5442     reg_off_split = elt_split << esz;
5443     mem_off_split = elt_split * msize;
5444 
5445     /*
5446      * This is the last full element on the first page, but it is not
5447      * necessarily active.  If there is no full element, i.e. the first
5448      * active element is the one that's split, this value remains -1.
5449      * It is useful as an iteration bound.
5450      */
5451     if (elt_split != 0) {
5452         info->reg_off_last[0] = reg_off_split - esize;
5453     }
5454 
5455     /* Determine if an unaligned element spans the pages.  */
5456     if (page_split % msize != 0) {
5457         /* It is helpful to know if the split element is active. */
5458         if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5459             info->reg_off_split = reg_off_split;
5460             info->mem_off_split = mem_off_split;
5461 
5462             if (reg_off_split == reg_off_last) {
5463                 /* The page crossing element is last. */
5464                 return true;
5465             }
5466         }
5467         reg_off_split += esize;
5468         mem_off_split += msize;
5469     }
5470 
5471     /*
5472      * We do want the first active element on the second page, because
5473      * this may affect the address reported in an exception.
5474      */
5475     reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5476     tcg_debug_assert(reg_off_split <= reg_off_last);
5477     info->reg_off_first[1] = reg_off_split;
5478     info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5479     info->reg_off_last[1] = reg_off_last;
5480     return true;
5481 }
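
/*
 * Worked example for the scan above (illustrative values only):
 * take reg_max = 32 (a 256-bit vector), esz = MO_32 (esize = 4),
 * msize = 4, an all-true predicate, and addr 10 bytes before a page
 * boundary.  Then page_split = 10, elt_split = 2, so reg_off_split =
 * mem_off_split = 8 and reg_off_last[0] = 4 (the last whole element
 * on the first page).  Since 10 % 4 != 0, the element at offset 8
 * spans the boundary, and reg_off_first[1] = 12, reg_off_last[1] = 28
 * describe the elements wholly on the second page.
 */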
5482 
5483 /*
5484  * Resolve the guest virtual addresses to info->page[].
5485  * Control the generation of page faults with @fault.  Return false if
5486  * there is no work to do, which can only happen with @fault == FAULT_NO.
5487  */
5488 bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5489                          CPUARMState *env, target_ulong addr,
5490                          MMUAccessType access_type, uintptr_t retaddr)
5491 {
5492     int mmu_idx = arm_env_mmu_index(env);
5493     int mem_off = info->mem_off_first[0];
5494     bool nofault = fault == FAULT_NO;
5495     bool have_work = true;
5496 
5497     if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5498                         access_type, mmu_idx, retaddr)) {
5499         /* No work to be done. */
5500         return false;
5501     }
5502 
5503     if (likely(info->page_split < 0)) {
5504         /* The entire operation was on the one page. */
5505         return true;
5506     }
5507 
5508     /*
5509      * If the second page is invalid, then we want the fault address to be
5510      * the first byte on that page which is accessed.
5511      */
5512     if (info->mem_off_split >= 0) {
5513         /*
5514          * There is an element split across the pages.  The fault address
5515          * should be the first byte of the second page.
5516          */
5517         mem_off = info->page_split;
5518         /*
5519          * If the split element is also the first active element
5520          * of the vector, then:  For first-fault we should continue
5521          * to generate faults for the second page.  For no-fault,
5522          * we have work only if the second page is valid.
5523          */
5524         if (info->mem_off_first[0] < info->mem_off_split) {
5525             nofault = true;
5526             have_work = false;
5527         }
5528     } else {
5529         /*
5530          * There is no element split across the pages.  The fault address
5531          * should be the first active element on the second page.
5532          */
5533         mem_off = info->mem_off_first[1];
5534         /*
5535          * There must have been one active element on the first page,
5536          * so we're out of first-fault territory.
5537          */
5538         nofault = fault != FAULT_ALL;
5539     }
5540 
5541     have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5542                                 access_type, mmu_idx, retaddr);
5543     return have_work;
5544 }
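
/*
 * To restate the @fault handling above: FAULT_ALL probes with faults
 * enabled, so an invalid page raises an exception here (or, for a
 * split element, later via the slow tlb path).  FAULT_FIRST traps
 * only until the first active element has succeeded; beyond that a
 * bad second page merely limits the remaining work.  FAULT_NO never
 * traps, and the return value reports whether any work is possible.
 */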
5545 
5546 #ifndef CONFIG_USER_ONLY
5547 void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5548                                uint64_t *vg, target_ulong addr,
5549                                int esize, int msize, int wp_access,
5550                                uintptr_t retaddr)
5551 {
5552     intptr_t mem_off, reg_off, reg_last;
5553     int flags0 = info->page[0].flags;
5554     int flags1 = info->page[1].flags;
5555 
5556     if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5557         return;
5558     }
5559 
5560     /* Indicate that watchpoints are handled. */
5561     info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5562     info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5563 
5564     if (flags0 & TLB_WATCHPOINT) {
5565         mem_off = info->mem_off_first[0];
5566         reg_off = info->reg_off_first[0];
5567         reg_last = info->reg_off_last[0];
5568 
5569         while (reg_off <= reg_last) {
5570             uint64_t pg = vg[reg_off >> 6];
5571             do {
5572                 if ((pg >> (reg_off & 63)) & 1) {
5573                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5574                                          msize, info->page[0].attrs,
5575                                          wp_access, retaddr);
5576                 }
5577                 reg_off += esize;
5578                 mem_off += msize;
5579             } while (reg_off <= reg_last && (reg_off & 63));
5580         }
5581     }
5582 
5583     mem_off = info->mem_off_split;
5584     if (mem_off >= 0) {
5585         cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5586                              info->page[0].attrs, wp_access, retaddr);
5587     }
5588 
5589     mem_off = info->mem_off_first[1];
5590     if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5591         reg_off = info->reg_off_first[1];
5592         reg_last = info->reg_off_last[1];
5593 
5594         do {
5595             uint64_t pg = vg[reg_off >> 6];
5596             do {
5597                 if ((pg >> (reg_off & 63)) & 1) {
5598                     cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5599                                          msize, info->page[1].attrs,
5600                                          wp_access, retaddr);
5601                 }
5602                 reg_off += esize;
5603                 mem_off += msize;
5604             } while (reg_off & 63);
5605         } while (reg_off <= reg_last);
5606     }
5607 }
5608 #endif
5609 
5610 void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5611                              uint64_t *vg, target_ulong addr, int esize,
5612                              int msize, uint32_t mtedesc, uintptr_t ra)
5613 {
5614     intptr_t mem_off, reg_off, reg_last;
5615 
5616     /* Process the page only if MemAttr == Tagged. */
5617     if (info->page[0].tagged) {
5618         mem_off = info->mem_off_first[0];
5619         reg_off = info->reg_off_first[0];
5620         reg_last = info->reg_off_split;
5621         if (reg_last < 0) {
5622             reg_last = info->reg_off_last[0];
5623         }
5624 
5625         do {
5626             uint64_t pg = vg[reg_off >> 6];
5627             do {
5628                 if ((pg >> (reg_off & 63)) & 1) {
5629                     mte_check(env, mtedesc, addr, ra);
5630                 }
5631                 reg_off += esize;
5632                 mem_off += msize;
5633             } while (reg_off <= reg_last && (reg_off & 63));
5634         } while (reg_off <= reg_last);
5635     }
5636 
5637     mem_off = info->mem_off_first[1];
5638     if (mem_off >= 0 && info->page[1].tagged) {
5639         reg_off = info->reg_off_first[1];
5640         reg_last = info->reg_off_last[1];
5641 
5642         do {
5643             uint64_t pg = vg[reg_off >> 6];
5644             do {
5645                 if ((pg >> (reg_off & 63)) & 1) {
5646                     mte_check(env, mtedesc, addr, ra);
5647                 }
5648                 reg_off += esize;
5649                 mem_off += msize;
5650             } while (reg_off & 63);
5651         } while (reg_off <= reg_last);
5652     }
5653 }
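
/*
 * The double loop above is this file's standard predicate walk:
 * the outer loop fetches one 64-bit predicate word (vg[reg_off >> 6])
 * and the inner loop tests bit (reg_off & 63) for each element within
 * it, e.g. at bits 0, 8, 16, ... 56 when esize == 8, before the next
 * word is loaded.
 */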
5654 
5655 /*
5656  * Common helper for all contiguous 1,2,3,4-register predicated loads.
5657  */
5658 static inline QEMU_ALWAYS_INLINE
5659 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
5660                uint32_t desc, const uintptr_t retaddr,
5661                const int esz, const int msz, const int N, uint32_t mtedesc,
5662                sve_ldst1_host_fn *host_fn,
5663                sve_ldst1_tlb_fn *tlb_fn)
5664 {
5665     const unsigned rd = simd_data(desc);
5666     const intptr_t reg_max = simd_oprsz(desc);
5667     intptr_t reg_off, reg_last, mem_off;
5668     SVEContLdSt info;
5669     void *host;
5670     int flags, i;
5671 
5672     /* Find the active elements.  */
5673     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5674         /* The entire predicate was false; no load occurs.  */
5675         for (i = 0; i < N; ++i) {
5676             memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5677         }
5678         return;
5679     }
5680 
5681     /* Probe the page(s).  Exit with exception for any invalid page. */
5682     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
5683 
5684     /* Handle watchpoints for all active elements. */
5685     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5686                               BP_MEM_READ, retaddr);
5687 
5688     /*
5689      * Handle mte checks for all active elements.
5690      * Since TBI must be set for MTE, !mtedesc => !mte_active.
5691      */
5692     if (mtedesc) {
5693         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5694                                 mtedesc, retaddr);
5695     }
5696 
5697     flags = info.page[0].flags | info.page[1].flags;
5698     if (unlikely(flags != 0)) {
5699         /*
5700          * At least one page includes MMIO.
5701          * Any bus operation can fail with cpu_transaction_failed,
5702          * which for ARM will raise SyncExternal.  Perform the load
5703          * into scratch memory to preserve register state until the end.
5704          */
5705         ARMVectorReg scratch[4] = { };
5706 
5707         mem_off = info.mem_off_first[0];
5708         reg_off = info.reg_off_first[0];
5709         reg_last = info.reg_off_last[1];
5710         if (reg_last < 0) {
5711             reg_last = info.reg_off_split;
5712             if (reg_last < 0) {
5713                 reg_last = info.reg_off_last[0];
5714             }
5715         }
5716 
5717         do {
5718             uint64_t pg = vg[reg_off >> 6];
5719             do {
5720                 if ((pg >> (reg_off & 63)) & 1) {
5721                     for (i = 0; i < N; ++i) {
5722                         tlb_fn(env, &scratch[i], reg_off,
5723                                addr + mem_off + (i << msz), retaddr);
5724                     }
5725                 }
5726                 reg_off += 1 << esz;
5727                 mem_off += N << msz;
5728             } while (reg_off & 63);
5729         } while (reg_off <= reg_last);
5730 
5731         for (i = 0; i < N; ++i) {
5732             memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5733         }
5734         return;
5735     }
5736 
5737     /* The entire operation is in RAM, on valid pages. */
5738 
5739     for (i = 0; i < N; ++i) {
5740         memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5741     }
5742 
5743     mem_off = info.mem_off_first[0];
5744     reg_off = info.reg_off_first[0];
5745     reg_last = info.reg_off_last[0];
5746     host = info.page[0].host;
5747 
5748     set_helper_retaddr(retaddr);
5749 
5750     while (reg_off <= reg_last) {
5751         uint64_t pg = vg[reg_off >> 6];
5752         do {
5753             if ((pg >> (reg_off & 63)) & 1) {
5754                 for (i = 0; i < N; ++i) {
5755                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5756                             host + mem_off + (i << msz));
5757                 }
5758             }
5759             reg_off += 1 << esz;
5760             mem_off += N << msz;
5761         } while (reg_off <= reg_last && (reg_off & 63));
5762     }
5763 
5764     clear_helper_retaddr();
5765 
5766     /*
5767      * Use the slow path to manage the cross-page misalignment.
5768      * But we know this is RAM and cannot trap.
5769      */
5770     mem_off = info.mem_off_split;
5771     if (unlikely(mem_off >= 0)) {
5772         reg_off = info.reg_off_split;
5773         for (i = 0; i < N; ++i) {
5774             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5775                    addr + mem_off + (i << msz), retaddr);
5776         }
5777     }
5778 
5779     mem_off = info.mem_off_first[1];
5780     if (unlikely(mem_off >= 0)) {
5781         reg_off = info.reg_off_first[1];
5782         reg_last = info.reg_off_last[1];
5783         host = info.page[1].host;
5784 
5785         set_helper_retaddr(retaddr);
5786 
5787         do {
5788             uint64_t pg = vg[reg_off >> 6];
5789             do {
5790                 if ((pg >> (reg_off & 63)) & 1) {
5791                     for (i = 0; i < N; ++i) {
5792                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5793                                 host + mem_off + (i << msz));
5794                     }
5795                 }
5796                 reg_off += 1 << esz;
5797                 mem_off += N << msz;
5798             } while (reg_off & 63);
5799         } while (reg_off <= reg_last);
5800 
5801         clear_helper_retaddr();
5802     }
5803 }
5804 
5805 static inline QEMU_ALWAYS_INLINE
5806 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5807                    uint32_t desc, const uintptr_t ra,
5808                    const int esz, const int msz, const int N,
5809                    sve_ldst1_host_fn *host_fn,
5810                    sve_ldst1_tlb_fn *tlb_fn)
5811 {
5812     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5813     int bit55 = extract64(addr, 55, 1);
5814 
5815     /* Remove mtedesc from the normal sve descriptor. */
5816     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5817 
5818     /* Perform gross MTE suppression early. */
5819     if (!tbi_check(mtedesc, bit55) ||
5820         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
5821         mtedesc = 0;
5822     }
5823 
5824     sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5825 }
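
/*
 * Sketch of the suppression above: for an address with bit 55 clear,
 * tbi_check() consults the TBI0 bit packed into mtedesc; if top-byte
 * ignore is disabled for that half of the address space, tags can
 * never be checked, so mtedesc is zeroed and every later
 * "if (mtedesc)" test short-circuits.  A TCMA match (tag 0x0 or 0xf,
 * as configured) likewise makes the access unchecked.
 */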
5826 
5827 #define DO_LD1_1(NAME, ESZ)                                             \
5828 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
5829                             target_ulong addr, uint32_t desc)           \
5830 {                                                                       \
5831     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
5832               sve_##NAME##_host, sve_##NAME##_tlb);                     \
5833 }                                                                       \
5834 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
5835                                 target_ulong addr, uint32_t desc)       \
5836 {                                                                       \
5837     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
5838                   sve_##NAME##_host, sve_##NAME##_tlb);                 \
5839 }
5840 
5841 #define DO_LD1_2(NAME, ESZ, MSZ)                                        \
5842 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
5843                                target_ulong addr, uint32_t desc)        \
5844 {                                                                       \
5845     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
5846               sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
5847 }                                                                       \
5848 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
5849                                target_ulong addr, uint32_t desc)        \
5850 {                                                                       \
5851     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
5852               sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
5853 }                                                                       \
5854 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
5855                                    target_ulong addr, uint32_t desc)    \
5856 {                                                                       \
5857     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
5858                   sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
5859 }                                                                       \
5860 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
5861                                    target_ulong addr, uint32_t desc)    \
5862 {                                                                       \
5863     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
5864                   sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
5865 }
5866 
5867 DO_LD1_1(ld1bb,  MO_8)
5868 DO_LD1_1(ld1bhu, MO_16)
5869 DO_LD1_1(ld1bhs, MO_16)
5870 DO_LD1_1(ld1bsu, MO_32)
5871 DO_LD1_1(ld1bss, MO_32)
5872 DO_LD1_1(ld1bdu, MO_64)
5873 DO_LD1_1(ld1bds, MO_64)
5874 
5875 DO_LD1_2(ld1hh,  MO_16, MO_16)
5876 DO_LD1_2(ld1hsu, MO_32, MO_16)
5877 DO_LD1_2(ld1hss, MO_32, MO_16)
5878 DO_LD1_2(ld1hdu, MO_64, MO_16)
5879 DO_LD1_2(ld1hds, MO_64, MO_16)
5880 
5881 DO_LD1_2(ld1ss,  MO_32, MO_32)
5882 DO_LD1_2(ld1sdu, MO_64, MO_32)
5883 DO_LD1_2(ld1sds, MO_64, MO_32)
5884 
5885 DO_LD1_2(ld1dd,  MO_64, MO_64)
5886 
5887 #undef DO_LD1_1
5888 #undef DO_LD1_2
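
/*
 * For illustration, one expansion of the macros above (abridged):
 * DO_LD1_1(ld1bb, MO_8) defines
 *
 *     void HELPER(sve_ld1bb_r)(CPUARMState *env, void *vg,
 *                              target_ulong addr, uint32_t desc)
 *     {
 *         sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, 1, 0,
 *                   sve_ld1bb_host, sve_ld1bb_tlb);
 *     }
 *
 * plus the matching _r_mte variant; the constant esz/msz/N arguments
 * allow the always-inline common helper to specialize per form.
 */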
5889 
5890 #define DO_LDN_1(N)                                                     \
5891 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
5892                              target_ulong addr, uint32_t desc)          \
5893 {                                                                       \
5894     sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
5895               sve_ld1bb_host, sve_ld1bb_tlb);                           \
5896 }                                                                       \
5897 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
5898                                  target_ulong addr, uint32_t desc)      \
5899 {                                                                       \
5900     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
5901                   sve_ld1bb_host, sve_ld1bb_tlb);                       \
5902 }
5903 
5904 #define DO_LDN_2(N, SUFF, ESZ)                                          \
5905 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
5906                                     target_ulong addr, uint32_t desc)   \
5907 {                                                                       \
5908     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
5909               sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
5910 }                                                                       \
5911 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
5912                                     target_ulong addr, uint32_t desc)   \
5913 {                                                                       \
5914     sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
5915               sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
5916 }                                                                       \
5917 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
5918                                         target_ulong addr, uint32_t desc) \
5919 {                                                                       \
5920     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
5921                   sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
5922 }                                                                       \
5923 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
5924                                         target_ulong addr, uint32_t desc) \
5925 {                                                                       \
5926     sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
5927                   sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
5928 }
5929 
5930 DO_LDN_1(2)
5931 DO_LDN_1(3)
5932 DO_LDN_1(4)
5933 
5934 DO_LDN_2(2, hh, MO_16)
5935 DO_LDN_2(3, hh, MO_16)
5936 DO_LDN_2(4, hh, MO_16)
5937 
5938 DO_LDN_2(2, ss, MO_32)
5939 DO_LDN_2(3, ss, MO_32)
5940 DO_LDN_2(4, ss, MO_32)
5941 
5942 DO_LDN_2(2, dd, MO_64)
5943 DO_LDN_2(3, dd, MO_64)
5944 DO_LDN_2(4, dd, MO_64)
5945 
5946 #undef DO_LDN_1
5947 #undef DO_LDN_2
5948 
5949 /*
5950  * Load contiguous data, first-fault and no-fault.
5951  *
5952  * For user-only, we control the race between page_check_range and
5953  * another thread's munmap by using set/clear_helper_retaddr.  Any
5954  * SEGV that occurs between those markers is assumed to be because
5955  * the guest page vanished.  Keep that block as small as possible
5956  * so that unrelated QEMU bugs are not blamed on the guest.
5957  */
5958 
5959 /* Fault on byte I.  All bits in FFR from I are cleared.  The vector
5960  * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5961  * option, which leaves subsequent data unchanged.
5962  */
5963 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5964 {
5965     uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5966 
5967     if (i & 63) {
5968         ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5969         i = ROUND_UP(i, 64);
5970     }
5971     for (; i < oprsz; i += 64) {
5972         ffr[i / 64] = 0;
5973     }
5974 }
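
/*
 * Worked example (illustrative): with a 256-bit vector (oprsz = 32
 * predicate bits) and a fault at the element covering byte 12,
 * record_fault(env, 12, 32) keeps FFR bits 0..11 via
 * MAKE_64BIT_MASK(0, 12); ROUND_UP(12, 64) >= 32, so the clearing
 * loop then has nothing left to do.
 */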
5975 
5976 /*
5977  * Common helper for all contiguous no-fault and first-fault loads.
5978  */
5979 static inline QEMU_ALWAYS_INLINE
5980 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
5981                    uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
5982                    const int esz, const int msz, const SVEContFault fault,
5983                    sve_ldst1_host_fn *host_fn,
5984                    sve_ldst1_tlb_fn *tlb_fn)
5985 {
5986     const unsigned rd = simd_data(desc);
5987     void *vd = &env->vfp.zregs[rd];
5988     const intptr_t reg_max = simd_oprsz(desc);
5989     intptr_t reg_off, mem_off, reg_last;
5990     SVEContLdSt info;
5991     int flags;
5992     void *host;
5993 
5994     /* Find the active elements.  */
5995     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
5996         /* The entire predicate was false; no load occurs.  */
5997         memset(vd, 0, reg_max);
5998         return;
5999     }
6000     reg_off = info.reg_off_first[0];
6001 
6002     /* Probe the page(s). */
6003     if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
6004         /* Fault on first element. */
6005         tcg_debug_assert(fault == FAULT_NO);
6006         memset(vd, 0, reg_max);
6007         goto do_fault;
6008     }
6009 
6010     mem_off = info.mem_off_first[0];
6011     flags = info.page[0].flags;
6012 
6013     /*
6014      * Disable MTE checking if the Tagged bit is not set.  Since TBI must
6015      * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6016      */
6017     if (!info.page[0].tagged) {
6018         mtedesc = 0;
6019     }
6020 
6021     if (fault == FAULT_FIRST) {
6022         /* Trapping mte check for the first-fault element.  */
6023         if (mtedesc) {
6024             mte_check(env, mtedesc, addr + mem_off, retaddr);
6025         }
6026 
6027         /*
6028          * Special handling of the first active element,
6029          * if it crosses a page boundary or is MMIO.
6030          */
6031         bool is_split = mem_off == info.mem_off_split;
6032         if (unlikely(flags != 0) || unlikely(is_split)) {
6033             /*
6034              * Use the slow path for cross-page handling.
6035              * Might trap for MMIO or watchpoints.
6036              */
6037             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6038 
6039             /* After any fault, zero the other elements. */
6040             swap_memzero(vd, reg_off);
6041             reg_off += 1 << esz;
6042             mem_off += 1 << msz;
6043             swap_memzero(vd + reg_off, reg_max - reg_off);
6044 
6045             if (is_split) {
6046                 goto second_page;
6047             }
6048         } else {
6049             memset(vd, 0, reg_max);
6050         }
6051     } else {
6052         memset(vd, 0, reg_max);
6053         if (unlikely(mem_off == info.mem_off_split)) {
6054             /* The first active element crosses a page boundary. */
6055             flags |= info.page[1].flags;
6056             if (unlikely(flags & TLB_MMIO)) {
6057                 /* Some page is MMIO, see below. */
6058                 goto do_fault;
6059             }
6060             if (unlikely(flags & TLB_WATCHPOINT) &&
6061                 (cpu_watchpoint_address_matches
6062                  (env_cpu(env), addr + mem_off, 1 << msz)
6063                  & BP_MEM_READ)) {
6064                 /* Watchpoint hit, see below. */
6065                 goto do_fault;
6066             }
6067             if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6068                 goto do_fault;
6069             }
6070             /*
6071              * Use the slow path for cross-page handling.
6072              * This is RAM, without a watchpoint, and will not trap.
6073              */
6074             tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
6075             goto second_page;
6076         }
6077     }
6078 
6079     /*
6080      * From this point on, all memory operations are MemSingleNF.
6081      *
6082      * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6083      * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6084      *
6085      * Unfortunately we do not have access to the memory attributes from the
6086      * PTE to tell Device memory from Normal memory.  So we make a mostly
6087      * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6088      * This gives the right answer for the common cases of "Normal memory,
6089      * backed by host RAM" and "Device memory, backed by MMIO".
6090      * The architecture allows us to suppress an NF load and return
6091      * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6092      * case of "Normal memory, backed by MMIO" is permitted.  The case we
6093      * get wrong is "Device memory, backed by host RAM", for which we
6094      * should return (UNKNOWN, FAULT) but do not.
6095      *
6096      * Similarly, CPU_BP breakpoints would raise exceptions, and so
6097      * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
6098      * architectural breakpoints the same.
6099      */
6100     if (unlikely(flags & TLB_MMIO)) {
6101         goto do_fault;
6102     }
6103 
6104     reg_last = info.reg_off_last[0];
6105     host = info.page[0].host;
6106 
6107     set_helper_retaddr(retaddr);
6108 
6109     do {
6110         uint64_t pg = *(uint64_t *)(vg + ((reg_off >> 6) << 3));
6111         do {
6112             if ((pg >> (reg_off & 63)) & 1) {
6113                 if (unlikely(flags & TLB_WATCHPOINT) &&
6114                     (cpu_watchpoint_address_matches
6115                      (env_cpu(env), addr + mem_off, 1 << msz)
6116                      & BP_MEM_READ)) {
6117                     clear_helper_retaddr();
6118                     goto do_fault;
6119                 }
6120                 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
6121                     clear_helper_retaddr();
6122                     goto do_fault;
6123                 }
6124                 host_fn(vd, reg_off, host + mem_off);
6125             }
6126             reg_off += 1 << esz;
6127             mem_off += 1 << msz;
6128         } while (reg_off <= reg_last && (reg_off & 63));
6129     } while (reg_off <= reg_last);
6130 
6131     clear_helper_retaddr();
6132 
6133     /*
6134      * MemSingleNF is allowed to fail for any reason.  We have special
6135      * code above to handle the first element crossing a page boundary.
6136      * As an implementation choice, decline to handle a cross-page element
6137      * in any other position.
6138      */
6139     reg_off = info.reg_off_split;
6140     if (reg_off >= 0) {
6141         goto do_fault;
6142     }
6143 
6144  second_page:
6145     reg_off = info.reg_off_first[1];
6146     if (likely(reg_off < 0)) {
6147         /* No active elements on the second page.  All done. */
6148         return;
6149     }
6150 
6151     /*
6152      * MemSingleNF is allowed to fail for any reason.  As an implementation
6153      * choice, decline to handle elements on the second page.  This should
6154      * be low frequency as the guest walks through memory -- the next
6155      * iteration of the guest's loop should be aligned on the page boundary,
6156      * and then all following iterations will stay aligned.
6157      */
6158 
6159  do_fault:
6160     record_fault(env, reg_off, reg_max);
6161 }
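
/*
 * Guest-visible effect, by example (a sketch): for LDFF1B with all
 * elements active and memory becoming unreadable at the third element,
 * the first element traps normally if it is itself unreadable;
 * otherwise elements 0 and 1 are loaded, record_fault() clears FFR
 * from bit 2 upward, and the guest discovers via RDFFR how many
 * elements are valid.
 */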
6162 
6163 static inline QEMU_ALWAYS_INLINE
6164 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6165                        uint32_t desc, const uintptr_t retaddr,
6166                        const int esz, const int msz, const SVEContFault fault,
6167                        sve_ldst1_host_fn *host_fn,
6168                        sve_ldst1_tlb_fn *tlb_fn)
6169 {
6170     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6171     int bit55 = extract64(addr, 55, 1);
6172 
6173     /* Remove mtedesc from the normal sve descriptor. */
6174     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6175 
6176     /* Perform gross MTE suppression early. */
6177     if (!tbi_check(mtedesc, bit55) ||
6178         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6179         mtedesc = 0;
6180     }
6181 
6182     sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6183                   esz, msz, fault, host_fn, tlb_fn);
6184 }
6185 
6186 #define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
6187 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
6188                                  target_ulong addr, uint32_t desc)      \
6189 {                                                                       \
6190     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
6191                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6192 }                                                                       \
6193 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
6194                                  target_ulong addr, uint32_t desc)      \
6195 {                                                                       \
6196     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6197                   sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
6198 }                                                                       \
6199 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6200                                      target_ulong addr, uint32_t desc)  \
6201 {                                                                       \
6202     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6203                       sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
6204 }                                                                       \
6205 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
6206                                      target_ulong addr, uint32_t desc)  \
6207 {                                                                       \
6208     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
6209                       sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
6210 }
6211 
6212 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
6213 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
6214                                     target_ulong addr, uint32_t desc)   \
6215 {                                                                       \
6216     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6217                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6218 }                                                                       \
6219 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
6220                                     target_ulong addr, uint32_t desc)   \
6221 {                                                                       \
6222     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6223                   sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
6224 }                                                                       \
6225 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
6226                                     target_ulong addr, uint32_t desc)   \
6227 {                                                                       \
6228     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
6229                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6230 }                                                                       \
6231 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
6232                                     target_ulong addr, uint32_t desc)   \
6233 {                                                                       \
6234     sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
6235                   sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
6236 }                                                                       \
6237 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6238                                         target_ulong addr, uint32_t desc) \
6239 {                                                                       \
6240     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6241                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6242 }                                                                       \
6243 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
6244                                         target_ulong addr, uint32_t desc) \
6245 {                                                                       \
6246     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6247                       sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6248 }                                                                       \
6249 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6250                                         target_ulong addr, uint32_t desc) \
6251 {                                                                       \
6252     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6253                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6254 }                                                                       \
6255 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
6256                                         target_ulong addr, uint32_t desc) \
6257 {                                                                       \
6258     sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6259                       sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6260 }
6261 
6262 DO_LDFF1_LDNF1_1(bb,  MO_8)
6263 DO_LDFF1_LDNF1_1(bhu, MO_16)
6264 DO_LDFF1_LDNF1_1(bhs, MO_16)
6265 DO_LDFF1_LDNF1_1(bsu, MO_32)
6266 DO_LDFF1_LDNF1_1(bss, MO_32)
6267 DO_LDFF1_LDNF1_1(bdu, MO_64)
6268 DO_LDFF1_LDNF1_1(bds, MO_64)
6269 
6270 DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
6271 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6272 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6273 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6274 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
6275 
6276 DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
6277 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6278 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
6279 
6280 DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)
6281 
6282 #undef DO_LDFF1_LDNF1_1
6283 #undef DO_LDFF1_LDNF1_2
6284 
6285 /*
6286  * Common helper for all contiguous 1,2,3,4-register predicated stores.
6287  */
6288 
6289 static inline QEMU_ALWAYS_INLINE
6290 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6291                uint32_t desc, const uintptr_t retaddr,
6292                const int esz, const int msz, const int N, uint32_t mtedesc,
6293                sve_ldst1_host_fn *host_fn,
6294                sve_ldst1_tlb_fn *tlb_fn)
6295 {
6296     const unsigned rd = simd_data(desc);
6297     const intptr_t reg_max = simd_oprsz(desc);
6298     intptr_t reg_off, reg_last, mem_off;
6299     SVEContLdSt info;
6300     void *host;
6301     int i, flags;
6302 
6303     /* Find the active elements.  */
6304     if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6305         /* The entire predicate was false; no store occurs.  */
6306         return;
6307     }
6308 
6309     /* Probe the page(s).  Exit with exception for any invalid page. */
6310     sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
6311 
6312     /* Handle watchpoints for all active elements. */
6313     sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6314                               BP_MEM_WRITE, retaddr);
6315 
6316     /*
6317      * Handle mte checks for all active elements.
6318      * Since TBI must be set for MTE, !mtedesc => !mte_active.
6319      */
6320     if (mtedesc) {
6321         sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6322                                 mtedesc, retaddr);
6323     }
6324 
6325     flags = info.page[0].flags | info.page[1].flags;
6326     if (unlikely(flags != 0)) {
6327         /*
6328          * At least one page includes MMIO.
6329          * Any bus operation can fail with cpu_transaction_failed,
6330          * which for ARM will raise SyncExternal.  We cannot avoid
6331          * this fault and will leave with the store incomplete.
6332          */
6333         mem_off = info.mem_off_first[0];
6334         reg_off = info.reg_off_first[0];
6335         reg_last = info.reg_off_last[1];
6336         if (reg_last < 0) {
6337             reg_last = info.reg_off_split;
6338             if (reg_last < 0) {
6339                 reg_last = info.reg_off_last[0];
6340             }
6341         }
6342 
6343         do {
6344             uint64_t pg = vg[reg_off >> 6];
6345             do {
6346                 if ((pg >> (reg_off & 63)) & 1) {
6347                     for (i = 0; i < N; ++i) {
6348                         tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6349                                addr + mem_off + (i << msz), retaddr);
6350                     }
6351                 }
6352                 reg_off += 1 << esz;
6353                 mem_off += N << msz;
6354             } while (reg_off & 63);
6355         } while (reg_off <= reg_last);
6356         return;
6357     }
6358 
6359     mem_off = info.mem_off_first[0];
6360     reg_off = info.reg_off_first[0];
6361     reg_last = info.reg_off_last[0];
6362     host = info.page[0].host;
6363 
6364     set_helper_retaddr(retaddr);
6365 
6366     while (reg_off <= reg_last) {
6367         uint64_t pg = vg[reg_off >> 6];
6368         do {
6369             if ((pg >> (reg_off & 63)) & 1) {
6370                 for (i = 0; i < N; ++i) {
6371                     host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6372                             host + mem_off + (i << msz));
6373                 }
6374             }
6375             reg_off += 1 << esz;
6376             mem_off += N << msz;
6377         } while (reg_off <= reg_last && (reg_off & 63));
6378     }
6379 
6380     clear_helper_retaddr();
6381 
6382     /*
6383      * Use the slow path to manage the cross-page misalignment.
6384      * But we know this is RAM and cannot trap.
6385      */
6386     mem_off = info.mem_off_split;
6387     if (unlikely(mem_off >= 0)) {
6388         reg_off = info.reg_off_split;
6389         for (i = 0; i < N; ++i) {
6390             tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6391                    addr + mem_off + (i << msz), retaddr);
6392         }
6393     }
6394 
6395     mem_off = info.mem_off_first[1];
6396     if (unlikely(mem_off >= 0)) {
6397         reg_off = info.reg_off_first[1];
6398         reg_last = info.reg_off_last[1];
6399         host = info.page[1].host;
6400 
6401         set_helper_retaddr(retaddr);
6402 
6403         do {
6404             uint64_t pg = vg[reg_off >> 6];
6405             do {
6406                 if ((pg >> (reg_off & 63)) & 1) {
6407                     for (i = 0; i < N; ++i) {
6408                         host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6409                                 host + mem_off + (i << msz));
6410                     }
6411                 }
6412                 reg_off += 1 << esz;
6413                 mem_off += N << msz;
6414             } while (reg_off & 63);
6415         } while (reg_off <= reg_last);
6416 
6417         clear_helper_retaddr();
6418     }
6419 }
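
/*
 * Note that the store path above mirrors sve_ldN_r() with two
 * differences: there is no initial zeroing, since stores do not
 * modify the register file, and the MMIO slow path needs no scratch
 * buffer -- if a bus operation faults mid-way the store is simply
 * left incomplete, as the comment above describes.
 */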
6420 
6421 static inline QEMU_ALWAYS_INLINE
6422 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6423                    uint32_t desc, const uintptr_t ra,
6424                    const int esz, const int msz, const int N,
6425                    sve_ldst1_host_fn *host_fn,
6426                    sve_ldst1_tlb_fn *tlb_fn)
6427 {
6428     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6429     int bit55 = extract64(addr, 55, 1);
6430 
6431     /* Remove mtedesc from the normal sve descriptor. */
6432     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6433 
6434     /* Perform gross MTE suppression early. */
6435     if (!tbi_check(mtedesc, bit55) ||
6436         tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
6437         mtedesc = 0;
6438     }
6439 
6440     sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
6441 }
6442 
6443 #define DO_STN_1(N, NAME, ESZ)                                          \
6444 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
6445                                  target_ulong addr, uint32_t desc)      \
6446 {                                                                       \
6447     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
6448               sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
6449 }                                                                       \
6450 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
6451                                      target_ulong addr, uint32_t desc)  \
6452 {                                                                       \
6453     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
6454                   sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
6455 }
6456 
6457 #define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
6458 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
6459                                     target_ulong addr, uint32_t desc)   \
6460 {                                                                       \
6461     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6462               sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
6463 }                                                                       \
6464 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
6465                                     target_ulong addr, uint32_t desc)   \
6466 {                                                                       \
6467     sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
6468               sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
6469 }                                                                       \
6470 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
6471                                         target_ulong addr, uint32_t desc) \
6472 {                                                                       \
6473     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6474                   sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
6475 }                                                                       \
6476 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
6477                                         target_ulong addr, uint32_t desc) \
6478 {                                                                       \
6479     sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
6480                   sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
6481 }
6482 
6483 DO_STN_1(1, bb, MO_8)
6484 DO_STN_1(1, bh, MO_16)
6485 DO_STN_1(1, bs, MO_32)
6486 DO_STN_1(1, bd, MO_64)
6487 DO_STN_1(2, bb, MO_8)
6488 DO_STN_1(3, bb, MO_8)
6489 DO_STN_1(4, bb, MO_8)
6490 
6491 DO_STN_2(1, hh, MO_16, MO_16)
6492 DO_STN_2(1, hs, MO_32, MO_16)
6493 DO_STN_2(1, hd, MO_64, MO_16)
6494 DO_STN_2(2, hh, MO_16, MO_16)
6495 DO_STN_2(3, hh, MO_16, MO_16)
6496 DO_STN_2(4, hh, MO_16, MO_16)
6497 
6498 DO_STN_2(1, ss, MO_32, MO_32)
6499 DO_STN_2(1, sd, MO_64, MO_32)
6500 DO_STN_2(2, ss, MO_32, MO_32)
6501 DO_STN_2(3, ss, MO_32, MO_32)
6502 DO_STN_2(4, ss, MO_32, MO_32)
6503 
6504 DO_STN_2(1, dd, MO_64, MO_64)
6505 DO_STN_2(2, dd, MO_64, MO_64)
6506 DO_STN_2(3, dd, MO_64, MO_64)
6507 DO_STN_2(4, dd, MO_64, MO_64)
6508 
6509 #undef DO_STN_1
6510 #undef DO_STN_2
6511 
6512 /*
6513  * Loads with a vector index.
6514  */
6515 
6516 /*
6517  * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6518  */
6519 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6520 
6521 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6522 {
6523     return *(uint32_t *)(reg + H1_4(reg_ofs));
6524 }
6525 
6526 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6527 {
6528     return *(int32_t *)(reg + H1_4(reg_ofs));
6529 }
6530 
6531 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6532 {
6533     return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6534 }
6535 
6536 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6537 {
6538     return (int32_t)*(uint64_t *)(reg + reg_ofs);
6539 }
6540 
6541 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6542 {
6543     return *(uint64_t *)(reg + reg_ofs);
6544 }
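
/*
 * Example values for the extractors above: a 32-bit offset element
 * holding 0xfffffffc is returned as 0xfffffffc by off_zsu_s() but
 * sign-extended to (target_ulong)-4 by off_zss_s(); the _d variants
 * do the same from the low half of a 64-bit element, and off_zd_d()
 * uses all 64 bits unmodified.
 */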
6545 
6546 static inline QEMU_ALWAYS_INLINE
6547 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6548                target_ulong base, uint32_t desc, uintptr_t retaddr,
6549                uint32_t mtedesc, int esize, int msize,
6550                zreg_off_fn *off_fn,
6551                sve_ldst1_host_fn *host_fn,
6552                sve_ldst1_tlb_fn *tlb_fn)
6553 {
6554     const int mmu_idx = arm_env_mmu_index(env);
6555     const intptr_t reg_max = simd_oprsz(desc);
6556     const int scale = simd_data(desc);
6557     ARMVectorReg scratch;
6558     intptr_t reg_off;
6559     SVEHostPage info, info2;
6560 
6561     memset(&scratch, 0, reg_max);
6562     reg_off = 0;
6563     do {
6564         uint64_t pg = vg[reg_off >> 6];
6565         do {
6566             if (likely(pg & 1)) {
6567                 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6568                 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6569 
6570                 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6571                                mmu_idx, retaddr);
6572 
6573                 if (likely(in_page >= msize)) {
6574                     if (unlikely(info.flags & TLB_WATCHPOINT)) {
6575                         cpu_check_watchpoint(env_cpu(env), addr, msize,
6576                                              info.attrs, BP_MEM_READ, retaddr);
6577                     }
6578                     if (mtedesc && info.tagged) {
6579                         mte_check(env, mtedesc, addr, retaddr);
6580                     }
6581                     if (unlikely(info.flags & TLB_MMIO)) {
6582                         tlb_fn(env, &scratch, reg_off, addr, retaddr);
6583                     } else {
6584                         set_helper_retaddr(retaddr);
6585                         host_fn(&scratch, reg_off, info.host);
6586                         clear_helper_retaddr();
6587                     }
6588                 } else {
6589                     /* Element crosses the page boundary. */
6590                     sve_probe_page(&info2, false, env, addr + in_page, 0,
6591                                    MMU_DATA_LOAD, mmu_idx, retaddr);
6592                     if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6593                         cpu_check_watchpoint(env_cpu(env), addr,
6594                                              msize, info.attrs,
6595                                              BP_MEM_READ, retaddr);
6596                     }
6597                     if (mtedesc && info.tagged) {
6598                         mte_check(env, mtedesc, addr, retaddr);
6599                     }
6600                     tlb_fn(env, &scratch, reg_off, addr, retaddr);
6601                 }
6602             }
6603             reg_off += esize;
6604             pg >>= esize;
6605         } while (reg_off & 63);
6606     } while (reg_off < reg_max);
6607 
6608     /* Wait until all exceptions have been raised to write back.  */
6609     memcpy(vd, &scratch, reg_max);
6610 }
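
/*
 * Address generation above, by example (a sketch): for an unscaled
 * 64-bit offset form, scale == 0 and each element loads from
 * base + off_zd_d(vm, reg_off); for a scaled form scale == msz, so
 * an offset element of 3 with 8-byte accesses resolves to
 * base + (3 << 3) = base + 24.
 */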
6611 
6612 static inline QEMU_ALWAYS_INLINE
6613 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6614                    target_ulong base, uint32_t desc, uintptr_t retaddr,
6615                    int esize, int msize, zreg_off_fn *off_fn,
6616                    sve_ldst1_host_fn *host_fn,
6617                    sve_ldst1_tlb_fn *tlb_fn)
6618 {
6619     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6620     /* Remove mtedesc from the normal sve descriptor. */
6621     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6622 
6623     /*
6624      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6625      * offset base entirely over the address space hole to change the
6626      * pointer tag, or change the bit55 selector.  So we could
6627      * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
6628      */
6629     sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6630               esize, msize, off_fn, host_fn, tlb_fn);
6631 }
6632 
6633 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6634 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6635                                  void *vm, target_ulong base, uint32_t desc) \
6636 {                                                                            \
6637     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,          \
6638               off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6639 }                                                                            \
6640 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6641      void *vm, target_ulong base, uint32_t desc)                             \
6642 {                                                                            \
6643     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,         \
6644                   off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6645 }
6646 
6647 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6648 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,       \
6649                                  void *vm, target_ulong base, uint32_t desc) \
6650 {                                                                            \
6651     sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,          \
6652               off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);       \
6653 }                                                                            \
6654 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6655     void *vm, target_ulong base, uint32_t desc)                              \
6656 {                                                                            \
6657     sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,         \
6658                   off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);   \
6659 }
6660 
6661 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6662 DO_LD1_ZPZ_S(bsu, zss, MO_8)
6663 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6664 DO_LD1_ZPZ_D(bdu, zss, MO_8)
6665 DO_LD1_ZPZ_D(bdu, zd, MO_8)
6666 
6667 DO_LD1_ZPZ_S(bss, zsu, MO_8)
6668 DO_LD1_ZPZ_S(bss, zss, MO_8)
6669 DO_LD1_ZPZ_D(bds, zsu, MO_8)
6670 DO_LD1_ZPZ_D(bds, zss, MO_8)
6671 DO_LD1_ZPZ_D(bds, zd, MO_8)
6672 
6673 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6674 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6675 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6676 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6677 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6678 
6679 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6680 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6681 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6682 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6683 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6684 
6685 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6686 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6687 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6688 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6689 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6690 
6691 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6692 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6693 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6694 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6695 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6696 
6697 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6698 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6699 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6700 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6701 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6702 
6703 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6704 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6705 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6706 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6707 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6708 
6709 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6710 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6711 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6712 
6713 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6714 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6715 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6716 
6717 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6718 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6719 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6720 
6721 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6722 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6723 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6724 
6725 #undef DO_LD1_ZPZ_S
6726 #undef DO_LD1_ZPZ_D
6727 
6728 /* First fault loads with a vector index.  */
6729 
6730 /*
6731  * Common helpers for all gather first-faulting loads.
6732  */
6733 
6734 static inline QEMU_ALWAYS_INLINE
6735 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6736                  target_ulong base, uint32_t desc, uintptr_t retaddr,
6737                  uint32_t mtedesc, const int esz, const int msz,
6738                  zreg_off_fn *off_fn,
6739                  sve_ldst1_host_fn *host_fn,
6740                  sve_ldst1_tlb_fn *tlb_fn)
6741 {
6742     const int mmu_idx = arm_env_mmu_index(env);
6743     const intptr_t reg_max = simd_oprsz(desc);
6744     const int scale = simd_data(desc);
6745     const int esize = 1 << esz;
6746     const int msize = 1 << msz;
6747     intptr_t reg_off;
6748     SVEHostPage info;
6749     target_ulong addr, in_page;
6750     ARMVectorReg scratch;
6751 
6752     /* Skip to the first active (true) predicate element.  */
6753     reg_off = find_next_active(vg, 0, reg_max, esz);
6754     if (unlikely(reg_off >= reg_max)) {
6755         /* The entire predicate was false; no load occurs.  */
6756         memset(vd, 0, reg_max);
6757         return;
6758     }
6759 
6760     /* Protect against overlap between vd and vm. */
6761     if (unlikely(vd == vm)) {
6762         vm = memcpy(&scratch, vm, reg_max);
6763     }
6764 
6765     /*
6766      * Probe the first element, allowing faults.
6767      */
6768     addr = base + (off_fn(vm, reg_off) << scale);
6769     if (mtedesc) {
6770         mte_check(env, mtedesc, addr, retaddr);
6771     }
6772     tlb_fn(env, vd, reg_off, addr, retaddr);
6773 
6774     /* The first element loaded without fault; zero the other elements. */
6775     swap_memzero(vd, reg_off);
6776     reg_off += esize;
6777     swap_memzero(vd + reg_off, reg_max - reg_off);
6778 
6779     /*
6780      * Probe the remaining elements, not allowing faults.
6781      */
6782     while (reg_off < reg_max) {
6783         uint64_t pg = vg[reg_off >> 6];
6784         do {
6785             if (likely((pg >> (reg_off & 63)) & 1)) {
6786                 addr = base + (off_fn(vm, reg_off) << scale);
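                /*
                 * TARGET_PAGE_MASK has all bits set above the page offset,
                 * so -(addr | TARGET_PAGE_MASK) is the number of bytes
                 * from addr to the end of its page.
                 */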
6787                 in_page = -(addr | TARGET_PAGE_MASK);
6788 
6789                 if (unlikely(in_page < msize)) {
6790                     /* Stop if the element crosses a page boundary. */
6791                     goto fault;
6792                 }
6793 
6794                 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6795                                mmu_idx, retaddr);
6796                 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6797                     goto fault;
6798                 }
6799                 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6800                     (cpu_watchpoint_address_matches
6801                      (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6802                     goto fault;
6803                 }
6804                 if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
6805                     goto fault;
6806                 }
6807 
6808                 set_helper_retaddr(retaddr);
6809                 host_fn(vd, reg_off, info.host);
6810                 clear_helper_retaddr();
6811             }
6812             reg_off += esize;
6813         } while (reg_off & 63);
6814     }
6815     return;
6816 
6817  fault:
6818     record_fault(env, reg_off, reg_max);
6819 }
6820 
6821 static inline QEMU_ALWAYS_INLINE
6822 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6823                      target_ulong base, uint32_t desc, uintptr_t retaddr,
6824                      const int esz, const int msz,
6825                      zreg_off_fn *off_fn,
6826                      sve_ldst1_host_fn *host_fn,
6827                      sve_ldst1_tlb_fn *tlb_fn)
6828 {
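    /*
     * The upper bits of desc hold the MTE descriptor; the low
     * SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT bits are the ordinary SVE
     * descriptor.  A zero mtedesc, as passed by the non-mte helpers,
     * disables the MTE checks in sve_ldff1_z().
     */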
6829     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6830     /* Remove mtedesc from the normal sve descriptor. */
6831     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6832 
6833     /*
6834      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6835      * move base entirely across the address space hole, so it cannot
6836      * change the pointer tag or the bit55 selector.  We could therefore
6837      * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
6838      */
6839     sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6840                 esz, msz, off_fn, host_fn, tlb_fn);
6841 }
6842 
6843 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
6844 void HELPER(sve_ldff##MEM##_##OFS)                                      \
6845     (CPUARMState *env, void *vd, void *vg,                              \
6846      void *vm, target_ulong base, uint32_t desc)                        \
6847 {                                                                       \
6848     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
6849                 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6850 }                                                                       \
6851 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
6852     (CPUARMState *env, void *vd, void *vg,                              \
6853      void *vm, target_ulong base, uint32_t desc)                        \
6854 {                                                                       \
6855     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
6856                     off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6857 }
6858 
6859 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
6860 void HELPER(sve_ldff##MEM##_##OFS)                                      \
6861     (CPUARMState *env, void *vd, void *vg,                              \
6862      void *vm, target_ulong base, uint32_t desc)                        \
6863 {                                                                       \
6864     sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
6865                 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6866 }                                                                       \
6867 void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
6868     (CPUARMState *env, void *vd, void *vg,                              \
6869      void *vm, target_ulong base, uint32_t desc)                        \
6870 {                                                                       \
6871     sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
6872                     off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6873 }
6874 
6875 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6876 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6877 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6878 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6879 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6880 
6881 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6882 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6883 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6884 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6885 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6886 
6887 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6888 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6889 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6890 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6891 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6892 
6893 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6894 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6895 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6896 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6897 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6898 
6899 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6900 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6901 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6902 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6903 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6904 
6905 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6906 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6907 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6908 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6909 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6910 
6911 DO_LDFF1_ZPZ_S(ss_le,  zsu, MO_32)
6912 DO_LDFF1_ZPZ_S(ss_le,  zss, MO_32)
6913 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6914 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6915 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6916 
6917 DO_LDFF1_ZPZ_S(ss_be,  zsu, MO_32)
6918 DO_LDFF1_ZPZ_S(ss_be,  zss, MO_32)
6919 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6920 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6921 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6922 
6923 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6924 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6925 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6926 
6927 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6928 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6929 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6930 
6931 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6932 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6933 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6934 
6935 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6936 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6937 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
6938 
6939 /* Stores with a vector index.  */
6940 
6941 static inline QEMU_ALWAYS_INLINE
6942 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6943                target_ulong base, uint32_t desc, uintptr_t retaddr,
6944                uint32_t mtedesc, int esize, int msize,
6945                zreg_off_fn *off_fn,
6946                sve_ldst1_host_fn *host_fn,
6947                sve_ldst1_tlb_fn *tlb_fn)
6948 {
6949     const int mmu_idx = arm_env_mmu_index(env);
6950     const intptr_t reg_max = simd_oprsz(desc);
6951     const int scale = simd_data(desc);
6952     void *host[ARM_MAX_VQ * 4];
6953     intptr_t reg_off, i;
6954     SVEHostPage info, info2;
6955 
6956     /*
6957      * Probe all of the elements for host addresses and flags.
6958      */
6959     i = reg_off = 0;
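    /*
     * One predicate bit governs each vector byte, so word reg_off >> 6
     * of vg covers a 64-byte chunk of the register; the inner loop
     * below walks one such chunk at a time.
     */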
6960     do {
6961         uint64_t pg = vg[reg_off >> 6];
6962         do {
6963             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6964             target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6965 
6966             host[i] = NULL;
6967             if (likely((pg >> (reg_off & 63)) & 1)) {
6968                 if (likely(in_page >= msize)) {
6969                     sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
6970                                    mmu_idx, retaddr);
6971                     if (!(info.flags & TLB_MMIO)) {
6972                         host[i] = info.host;
6973                     }
6974                 } else {
6975                     /*
6976                      * Element crosses the page boundary.
6977                      * Probe both pages, but do not record the host address,
6978                      * so that we use the slow path.
6979                      */
6980                     sve_probe_page(&info, false, env, addr, 0,
6981                                    MMU_DATA_STORE, mmu_idx, retaddr);
6982                     sve_probe_page(&info2, false, env, addr + in_page, 0,
6983                                    MMU_DATA_STORE, mmu_idx, retaddr);
6984                     info.flags |= info2.flags;
6985                 }
6986 
6987                 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6988                     cpu_check_watchpoint(env_cpu(env), addr, msize,
6989                                          info.attrs, BP_MEM_WRITE, retaddr);
6990                 }
6991 
6992                 if (mtedesc && info.tagged) {
6993                     mte_check(env, mtedesc, addr, retaddr);
6994                 }
6995             }
6996             i += 1;
6997             reg_off += esize;
6998         } while (reg_off & 63);
6999     } while (reg_off < reg_max);
7000 
7001     /*
7002      * Now that we have raised all exceptions except SyncExternal
7003      * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7004      *
7005      * Note that for the common case of an element in RAM that does not
7006      * cross a page boundary, we have stored the host address in host[].
7007      * This doubles as a first-level check against the predicate, since
7008      * only enabled elements have non-null host addresses.
7009      */
7010     i = reg_off = 0;
7011     do {
7012         void *h = host[i];
7013         if (likely(h != NULL)) {
7014             set_helper_retaddr(retaddr);
7015             host_fn(vd, reg_off, h);
7016             clear_helper_retaddr();
7017         } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
7018             target_ulong addr = base + (off_fn(vm, reg_off) << scale);
7019             tlb_fn(env, vd, reg_off, addr, retaddr);
7020         }
7021         i += 1;
7022         reg_off += esize;
7023     } while (reg_off < reg_max);
7024 }
7025 
7026 static inline QEMU_ALWAYS_INLINE
7027 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
7028                    target_ulong base, uint32_t desc, uintptr_t retaddr,
7029                    int esize, int msize, zreg_off_fn *off_fn,
7030                    sve_ldst1_host_fn *host_fn,
7031                    sve_ldst1_tlb_fn *tlb_fn)
7032 {
7033     uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7034     /* Remove mtedesc from the normal sve descriptor. */
7035     desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
7036 
7037     /*
7038      * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7039      * move base entirely across the address space hole, so it cannot
7040      * change the pointer tag or the bit55 selector.  We could therefore
7041      * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
7042      */
7043     sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
7044               esize, msize, off_fn, host_fn, tlb_fn);
7045 }
7046 
7047 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
7048 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7049                                  void *vm, target_ulong base, uint32_t desc) \
7050 {                                                                       \
7051     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
7052               off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7053 }                                                                       \
7054 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7055     void *vm, target_ulong base, uint32_t desc)                         \
7056 {                                                                       \
7057     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
7058                   off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7059 }
7060 
7061 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
7062 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
7063                                  void *vm, target_ulong base, uint32_t desc) \
7064 {                                                                       \
7065     sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
7066               off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
7067 }                                                                       \
7068 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
7069     void *vm, target_ulong base, uint32_t desc)                         \
7070 {                                                                       \
7071     sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
7072                   off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
7073 }
7074 
7075 DO_ST1_ZPZ_S(bs, zsu, MO_8)
7076 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
7077 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
7078 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
7079 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
7080 
7081 DO_ST1_ZPZ_S(bs, zss, MO_8)
7082 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
7083 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
7084 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
7085 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
7086 
7087 DO_ST1_ZPZ_D(bd, zsu, MO_8)
7088 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
7089 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
7090 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
7091 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
7092 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
7093 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
7094 
7095 DO_ST1_ZPZ_D(bd, zss, MO_8)
7096 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
7097 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
7098 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
7099 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
7100 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
7101 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
7102 
7103 DO_ST1_ZPZ_D(bd, zd, MO_8)
7104 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
7105 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
7106 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
7107 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
7108 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
7109 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
7110 
7111 #undef DO_ST1_ZPZ_S
7112 #undef DO_ST1_ZPZ_D
7113 
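/*
 * SVE2 bitwise ternary operations: each helper applies the same 64-bit
 * boolean function to every word of the operands, with k supplying the
 * third operand (the select mask for the BSL variants).
 */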
7114 void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7115 {
7116     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7117     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7118 
7119     for (i = 0; i < opr_sz; ++i) {
7120         d[i] = n[i] ^ m[i] ^ k[i];
7121     }
7122 }
7123 
7124 void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7125 {
7126     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7127     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7128 
7129     for (i = 0; i < opr_sz; ++i) {
7130         d[i] = n[i] ^ (m[i] & ~k[i]);
7131     }
7132 }
7133 
7134 void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7135 {
7136     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7137     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7138 
7139     for (i = 0; i < opr_sz; ++i) {
7140         d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
7141     }
7142 }
7143 
7144 void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7145 {
7146     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7147     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7148 
7149     for (i = 0; i < opr_sz; ++i) {
7150         d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
7151     }
7152 }
7153 
7154 void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
7155 {
7156     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7157     uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
7158 
7159     for (i = 0; i < opr_sz; ++i) {
7160         d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
7161     }
7162 }
7163 
7164 /*
7165  * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
7166  * See hasless(v,1) from
7167  *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
7168  */
7169 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
7170 {
7171     int bits = 8 << esz;
7172     uint64_t ones = dup_const(esz, 1);
7173     uint64_t signs = ones << (bits - 1);
7174     uint64_t cmp0, cmp1;
7175 
7176     cmp1 = dup_const(esz, n);
7177     cmp0 = cmp1 ^ m0;
7178     cmp1 = cmp1 ^ m1;
7179     cmp0 = (cmp0 - ones) & ~cmp0;
7180     cmp1 = (cmp1 - ones) & ~cmp1;
7181     return (cmp0 | cmp1) & signs;
7182 }
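
/*
 * Worked example of do_match2(), as a minimal sketch; SVE_MATCH2_EXAMPLE
 * is a hypothetical guard, not a QEMU build option.  With esz == MO_8,
 * ones == 0x0101010101010101 and signs == 0x8080808080808080.
 */
#ifdef SVE_MATCH2_EXAMPLE
static void do_match2_example(void)
{
    /* Byte lane 4 of m0 equals n, so a match is reported... */
    assert(do_match2(0x42, 0x0000004200000000ull, 0, MO_8));
    /* ...while a value present in neither m0 nor m1 is not. */
    assert(!do_match2(0x7f, 0x0000004200000000ull, 0, MO_8));
}
#endif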
7183 
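/*
 * MATCH (nmatch == false) sets a predicate bit for each active element
 * of vn that occurs somewhere within the corresponding 16-byte segment
 * of vm; NMATCH (nmatch == true) inverts the sense.  The return value
 * is the PREDTEST result for the output predicate.
 */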
7184 static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7185                                 uint32_t desc, int esz, bool nmatch)
7186 {
7187     uint16_t esz_mask = pred_esz_masks[esz];
7188     intptr_t opr_sz = simd_oprsz(desc);
7189     uint32_t flags = PREDTEST_INIT;
7190     intptr_t i, j, k;
7191 
7192     for (i = 0; i < opr_sz; i += 16) {
7193         uint64_t m0 = *(uint64_t *)(vm + i);
7194         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7195         uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7196         uint16_t out = 0;
7197 
7198         for (j = 0; j < 16; j += 8) {
7199             uint64_t n = *(uint64_t *)(vn + i + j);
7200 
7201             for (k = 0; k < 8; k += 1 << esz) {
7202                 if (pg & (1 << (j + k))) {
7203                     bool o = do_match2(n >> (k * 8), m0, m1, esz);
7204                     out |= (o ^ nmatch) << (j + k);
7205                 }
7206             }
7207         }
7208         *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7209         flags = iter_predtest_fwd(out, pg, flags);
7210     }
7211     return flags;
7212 }
7213 
7214 #define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
7215 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
7216 {                                                                             \
7217     return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
7218 }
7219 
7220 DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7221 DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7222 
7223 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7224 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7225 
7226 #undef DO_PPZZ_MATCH
7227 
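/*
 * HISTCNT: for each active element n[i], count the active elements m[j]
 * with j <= i that compare equal to it; inactive destination elements
 * are written as zero.
 */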
7228 void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
7229                             uint32_t desc)
7230 {
7231     ARMVectorReg scratch;
7232     intptr_t i, j;
7233     intptr_t opr_sz = simd_oprsz(desc);
7234     uint32_t *d = vd, *n = vn, *m = vm;
7235     uint8_t *pg = vg;
7236 
7237     if (d == n) {
7238         n = memcpy(&scratch, n, opr_sz);
7239         if (d == m) {
7240             m = n;
7241         }
7242     } else if (d == m) {
7243         m = memcpy(&scratch, m, opr_sz);
7244     }
7245 
7246     for (i = 0; i < opr_sz; i += 4) {
7247         uint64_t count = 0;
7248         uint8_t pred;
7249 
7250         pred = pg[H1(i >> 3)] >> (i & 7);
7251         if (pred & 1) {
7252             uint32_t nn = n[H4(i >> 2)];
7253 
7254             for (j = 0; j <= i; j += 4) {
7255                 pred = pg[H1(j >> 3)] >> (j & 7);
7256                 if ((pred & 1) && nn == m[H4(j >> 2)]) {
7257                     ++count;
7258                 }
7259             }
7260         }
7261         d[H4(i >> 2)] = count;
7262     }
7263 }
7264 
7265 void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
7266                             uint32_t desc)
7267 {
7268     ARMVectorReg scratch;
7269     intptr_t i, j;
7270     intptr_t opr_sz = simd_oprsz(desc);
7271     uint64_t *d = vd, *n = vn, *m = vm;
7272     uint8_t *pg = vg;
7273 
7274     if (d == n) {
7275         n = memcpy(&scratch, n, opr_sz);
7276         if (d == m) {
7277             m = n;
7278         }
7279     } else if (d == m) {
7280         m = memcpy(&scratch, m, opr_sz);
7281     }
7282 
7283     for (i = 0; i < opr_sz / 8; ++i) {
7284         uint64_t count = 0;
7285         if (pg[H1(i)] & 1) {
7286             uint64_t nn = n[i];
7287             for (j = 0; j <= i; ++j) {
7288                 if ((pg[H1(j)] & 1) && nn == m[j]) {
7289                     ++count;
7290                 }
7291             }
7292         }
7293         d[i] = count;
7294     }
7295 }
7296 
7297 /*
7298  * Returns the number of bytes in m0 and m1 that match n.
7299  * Unlike do_match2, we need an exact count rather than just true/false,
7300  * which requires two extra logical operations.
7301  */
7302 static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
7303 {
7304     const uint64_t mask = dup_const(MO_8, 0x7f);
7305     uint64_t cmp0, cmp1;
7306 
7307     cmp1 = dup_const(MO_8, n);
7308     cmp0 = cmp1 ^ m0;
7309     cmp1 = cmp1 ^ m1;
7310 
7311     /*
7312      * 1: clear msb of each byte to avoid carry to next byte (& mask)
7313      * 2: carry in to msb if byte != 0 (+ mask)
7314      * 3: set msb if cmp has msb set (| cmp)
7315      * 4: set the non-msb bits so they are ignored (| mask)
7316      * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7317      * 5: invert, resulting in 0x80 if and only if byte == 0.
7318      */
7319     cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
7320     cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
7321 
7322     /*
7323      * Combine the two compares in a way that the bits do
7324      * not overlap, and so preserves the count of set bits.
7325      * If the host has an efficient instruction for ctpop,
7326      * then ctpop(x) + ctpop(y) has the same number of
7327      * operations as ctpop(x | (y >> 1)).  If the host does
7328      * not have an efficient ctpop, then we only want to
7329      * use it once.
7330      */
7331     return ctpop64(cmp0 | (cmp1 >> 1));
7332 }
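
/*
 * Worked example of do_histseg_cnt(), as a minimal sketch;
 * SVE_HISTSEG_EXAMPLE is a hypothetical guard, not a QEMU build option.
 */
#ifdef SVE_HISTSEG_EXAMPLE
static void do_histseg_cnt_example(void)
{
    /*
     * 0x42 occurs twice in m0 (byte lanes 1 and 5) and once in m1
     * (lane 7): cmp0 gets 0x80 markers in lanes 1 and 5, cmp1 one
     * marker in lane 7, and the combined popcount is 3.
     */
    assert(do_histseg_cnt(0x42, 0x0000420000004200ull,
                          0x4200000000000000ull) == 3);
}
#endif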
7333 
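/*
 * HISTSEG: within each 16-byte segment, replace every byte of n with
 * the count of bytes in the corresponding segment of m equal to it.
 */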
7334 void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
7335 {
7336     intptr_t i, j;
7337     intptr_t opr_sz = simd_oprsz(desc);
7338 
7339     for (i = 0; i < opr_sz; i += 16) {
7340         uint64_t n0 = *(uint64_t *)(vn + i);
7341         uint64_t m0 = *(uint64_t *)(vm + i);
7342         uint64_t n1 = *(uint64_t *)(vn + i + 8);
7343         uint64_t m1 = *(uint64_t *)(vm + i + 8);
7344         uint64_t out0 = 0;
7345         uint64_t out1 = 0;
7346 
7347         for (j = 0; j < 64; j += 8) {
7348             uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
7349             uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
7350             out0 |= cnt0 << j;
7351             out1 |= cnt1 << j;
7352         }
7353 
7354         *(uint64_t *)(vd + i) = out0;
7355         *(uint64_t *)(vd + i + 8) = out1;
7356     }
7357 }
7358 
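/*
 * XAR: compute n ^ m, then rotate each element right by shr.  The byte
 * and halfword forms rotate all lanes at once with 64-bit shifts: mask
 * selects, within each lane, the bits that legitimately arrive from the
 * right shift, and ~mask those from the compensating left shift.
 */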
7359 void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
7360 {
7361     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7362     int shr = simd_data(desc);
7363     int shl = 8 - shr;
7364     uint64_t mask = dup_const(MO_8, 0xff >> shr);
7365     uint64_t *d = vd, *n = vn, *m = vm;
7366 
7367     for (i = 0; i < opr_sz; ++i) {
7368         uint64_t t = n[i] ^ m[i];
7369         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7370     }
7371 }
7372 
7373 void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
7374 {
7375     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
7376     int shr = simd_data(desc);
7377     int shl = 16 - shr;
7378     uint64_t mask = dup_const(MO_16, 0xffff >> shr);
7379     uint64_t *d = vd, *n = vn, *m = vm;
7380 
7381     for (i = 0; i < opr_sz; ++i) {
7382         uint64_t t = n[i] ^ m[i];
7383         d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
7384     }
7385 }
7386 
7387 void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
7388 {
7389     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
7390     int shr = simd_data(desc);
7391     uint32_t *d = vd, *n = vn, *m = vm;
7392 
7393     for (i = 0; i < opr_sz; ++i) {
7394         d[i] = ror32(n[i] ^ m[i], shr);
7395     }
7396 }
7397 
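/*
 * FMMLA: view each 128-bit segment of n, m, a and d as a 2x2 matrix
 * and compute d = a + n * m^T, i.e.
 *   d[i][j] = a[i][j] + n[i][0] * m[j][0] + n[i][1] * m[j][1]
 * which is the unrolled body below.
 */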
7398 void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
7399                      float_status *status, uint32_t desc)
7400 {
7401     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
7402 
7403     for (s = 0; s < opr_sz; ++s) {
7404         float32 *n = vn + s * sizeof(float32) * 4;
7405         float32 *m = vm + s * sizeof(float32) * 4;
7406         float32 *a = va + s * sizeof(float32) * 4;
7407         float32 *d = vd + s * sizeof(float32) * 4;
7408         float32 n00 = n[H4(0)], n01 = n[H4(1)];
7409         float32 n10 = n[H4(2)], n11 = n[H4(3)];
7410         float32 m00 = m[H4(0)], m01 = m[H4(1)];
7411         float32 m10 = m[H4(2)], m11 = m[H4(3)];
7412         float32 p0, p1;
7413 
7414         /* i = 0, j = 0 */
7415         p0 = float32_mul(n00, m00, status);
7416         p1 = float32_mul(n01, m01, status);
7417         d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
7418 
7419         /* i = 0, j = 1 */
7420         p0 = float32_mul(n00, m10, status);
7421         p1 = float32_mul(n01, m11, status);
7422         d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
7423 
7424         /* i = 1, j = 0 */
7425         p0 = float32_mul(n10, m00, status);
7426         p1 = float32_mul(n11, m01, status);
7427         d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
7428 
7429         /* i = 1, j = 1 */
7430         p0 = float32_mul(n10, m10, status);
7431         p1 = float32_mul(n11, m11, status);
7432         d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
7433     }
7434 }
7435 
7436 void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
7437                      float_status *status, uint32_t desc)
7438 {
7439     intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
7440 
7441     for (s = 0; s < opr_sz; ++s) {
7442         float64 *n = vn + s * sizeof(float64) * 4;
7443         float64 *m = vm + s * sizeof(float64) * 4;
7444         float64 *a = va + s * sizeof(float64) * 4;
7445         float64 *d = vd + s * sizeof(float64) * 4;
7446         float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
7447         float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
7448         float64 p0, p1;
7449 
7450         /* i = 0, j = 0 */
7451         p0 = float64_mul(n00, m00, status);
7452         p1 = float64_mul(n01, m01, status);
7453         d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
7454 
7455         /* i = 0, j = 1 */
7456         p0 = float64_mul(n00, m10, status);
7457         p1 = float64_mul(n01, m11, status);
7458         d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
7459 
7460         /* i = 1, j = 0 */
7461         p0 = float64_mul(n10, m00, status);
7462         p1 = float64_mul(n11, m01, status);
7463         d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
7464 
7465         /* i = 1, j = 1 */
7466         p0 = float64_mul(n10, m10, status);
7467         p1 = float64_mul(n11, m11, status);
7468         d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
7469     }
7470 }
7471 
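/*
 * FCVTNT: narrow each active wide element of vn and deposit the result
 * into the top (high) narrow half of the corresponding element of vd,
 * leaving the bottom narrow halves of vd unchanged.
 */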
7472 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7473 void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
7474                   float_status *status, uint32_t desc)                        \
7475 {                                                                             \
7476     intptr_t i = simd_oprsz(desc);                                            \
7477     uint64_t *g = vg;                                                         \
7478     do {                                                                      \
7479         uint64_t pg = g[(i - 1) >> 6];                                        \
7480         do {                                                                  \
7481             i -= sizeof(TYPEW);                                               \
7482             if (likely((pg >> (i & 63)) & 1)) {                               \
7483                 TYPEW nn = *(TYPEW *)(vn + HW(i));                            \
7484                 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status);      \
7485             }                                                                 \
7486         } while (i & 63);                                                     \
7487     } while (i != 0);                                                         \
7488 }
7489 
7490 DO_FCVTNT(sve_bfcvtnt,    uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
7491 DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
7492 DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
7493 
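/*
 * FCVTLT: the converse arrangement; widen the narrow value taken from
 * the top half of each active element into the corresponding full-width
 * element of vd.
 */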
7494 #define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP)                             \
7495 void HELPER(NAME)(void *vd, void *vn, void *vg,                               \
7496                   float_status *status, uint32_t desc)                        \
7497 {                                                                             \
7498     intptr_t i = simd_oprsz(desc);                                            \
7499     uint64_t *g = vg;                                                         \
7500     do {                                                                      \
7501         uint64_t pg = g[(i - 1) >> 6];                                        \
7502         do {                                                                  \
7503             i -= sizeof(TYPEW);                                               \
7504             if (likely((pg >> (i & 63)) & 1)) {                               \
7505                 TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN)));            \
7506                 *(TYPEW *)(vd + HW(i)) = OP(nn, status);                      \
7507             }                                                                 \
7508         } while (i & 63);                                                     \
7509     } while (i != 0);                                                         \
7510 }
7511 
7512 DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
7513 DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
7514 
7515 #undef DO_FCVTLT
7516 #undef DO_FCVTNT
7517